From ceb7aa5b083c9f25200343a1d77e76cf20a4e0c9 Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 Jun 2022 22:08:27 +0100 Subject: [PATCH] chore: update flag spider & add item loader for processing --- .../wikipedia_country_scraper/itemloaders.py | 12 ++++++++++++ .../wikipedia_country_scraper/items.py | 2 +- .../wikipedia_country_scraper/pipelines.py | 9 ++------- .../wikipedia_country_scraper/spiders/flags.py | 11 +++++++---- docs/scraping.md | 4 ++++ 5 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py new file mode 100644 index 0000000..4cb2a6d --- /dev/null +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py @@ -0,0 +1,12 @@ +from itemloaders.processors import TakeFirst, MapCompose +from scrapy.loader import ItemLoader +from w3lib.html import remove_tags + +class FlagsItemLoader(ItemLoader): + default_output_processor = TakeFirst() + + # country_name - convert "_" to " " + country_name_in = MapCompose(lambda country: country.replace("_", " ")) + + # flag_description - remove html tags (the _in suffix registers it as the field's input processor) + flag_description_in = MapCompose(remove_tags) diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py index 2926b42..6813cc7 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py @@ -33,7 +33,7 @@ class CapitalsItem(scrapy.Item): class FlagsItem(scrapy.Item): country_name = scrapy.Field() - flag_description_html = scrapy.Field() + flag_description = scrapy.Field() file_urls = scrapy.Field() files = scrapy.Field() diff --git 
a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py index e4025d0..680fbaa 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py @@ -33,12 +33,7 @@ class AnthemDownloadFilesPipeline(FilesPipeline): flag_filename = re.search(r"([^\/]*)$", request.url) if isinstance(flag_filename, re.Match): - if ( - (filename := flag_filename[1]).endswith("ogg") - or filename.endswith("oga") - or filename.endswith("mp3") - or filename.endswith("wav") - ): + if (filename := flag_filename[1]).endswith((".ogg", ".oga", ".mp3", ".wav")): return f"files/anthems/{filename}" @@ -47,5 +42,5 @@ class FlagDownloadFilesPipeline(FilesPipeline): flag_filename = re.search(r"([^\/]*)$", request.url) if isinstance(flag_filename, re.Match): - if (filename := flag_filename[1]).endswith(".svg"): + if (filename := flag_filename[1]).endswith((".svg", ".png")): return f"files/flags/{filename}" diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py index b4fac11..c0dbe2e 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py @@ -3,9 +3,10 @@ import pathlib import scrapy from scrapy.http import TextResponse from wikipedia_country_scraper.items import FlagsItem +from wikipedia_country_scraper.itemloaders import FlagsItemLoader -EXCLUDED = ["Réunion"] +EXCLUDED = ["Réunion", "Svalbard", "Mayotte", "Guadeloupe", "French_Guiana"] class FlagsSpider(scrapy.Spider): @@ -79,9 +80,11 @@ class FlagsSpider(scrapy.Spider): def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str): flag_image_result = response.xpath("//div[contains(@class, 
'fullImageLink')]/a/@href").get() - flags_item = FlagsItem() - flags_item["country_name"] = country_name - flags_item["flag_description_html"] = flag_description_html + flags_item_loader = FlagsItemLoader(item=FlagsItem()) + flags_item_loader.add_value("country_name", country_name) + flags_item_loader.add_value("flag_description", flag_description_html) + + flags_item = flags_item_loader.load_item() flags_item["file_urls"] = [f"https:{flag_image_result}"] yield flags_item diff --git a/docs/scraping.md b/docs/scraping.md index 9f16bb5..e7a31e3 100644 --- a/docs/scraping.md +++ b/docs/scraping.md @@ -31,6 +31,10 @@ Exporting JSON: Setting exports per spider: +Processing using item loaders + pipelines: + + + ### new project ```