chore: update flag spider & add item loader for processing

This commit is contained in:
2022-06-26 22:08:27 +01:00
parent 9ef6f94516
commit ceb7aa5b08
5 changed files with 26 additions and 12 deletions

View File

@@ -0,0 +1,12 @@
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
from w3lib.html import remove_tags
class FlagsItemLoader(ItemLoader):
    """Item loader that cleans flag fields before they are stored on the item.

    Scrapy resolves field-specific processors by attribute name using the
    ``<field>_in`` (input) and ``<field>_out`` (output) convention, so every
    per-field processor declared here must carry the matching suffix.
    """

    # Values are collected as lists; keep only the first non-empty value.
    default_output_processor = TakeFirst()

    # country_name - convert "_" to " "
    country_name_in = MapCompose(lambda country: country.replace("_", " "))

    # flag_description - remove html tags
    # The "_in" suffix is required: a bare ``flag_description`` attribute is
    # never consulted by the loader, which would leave the HTML tags in place.
    flag_description_in = MapCompose(remove_tags)

View File

@@ -33,7 +33,7 @@ class CapitalsItem(scrapy.Item):
class FlagsItem(scrapy.Item): class FlagsItem(scrapy.Item):
country_name = scrapy.Field() country_name = scrapy.Field()
flag_description_html = scrapy.Field() flag_description = scrapy.Field()
file_urls = scrapy.Field() file_urls = scrapy.Field()
files = scrapy.Field() files = scrapy.Field()

View File

@@ -33,12 +33,7 @@ class AnthemDownloadFilesPipeline(FilesPipeline):
flag_filename = re.search(r"([^\/]*)$", request.url) flag_filename = re.search(r"([^\/]*)$", request.url)
if isinstance(flag_filename, re.Match): if isinstance(flag_filename, re.Match):
if ( if (filename := flag_filename[1]).endswith(("ogg", "oga", "mp3", "wav")):
(filename := flag_filename[1]).endswith("ogg")
or filename.endswith("oga")
or filename.endswith("mp3")
or filename.endswith("wav")
):
return f"files/anthems/{filename}" return f"files/anthems/{filename}"
@@ -47,5 +42,5 @@ class FlagDownloadFilesPipeline(FilesPipeline):
flag_filename = re.search(r"([^\/]*)$", request.url) flag_filename = re.search(r"([^\/]*)$", request.url)
if isinstance(flag_filename, re.Match): if isinstance(flag_filename, re.Match):
if (filename := flag_filename[1]).endswith(".svg"): if (filename := flag_filename[1]).endswith(("svg", "png")):
return f"files/flags/{filename}" return f"files/flags/{filename}"

View File

@@ -3,9 +3,10 @@ import pathlib
import scrapy import scrapy
from scrapy.http import TextResponse from scrapy.http import TextResponse
from wikipedia_country_scraper.items import FlagsItem from wikipedia_country_scraper.items import FlagsItem
from wikipedia_country_scraper.itemloaders import FlagsItemLoader
EXCLUDED = ["Réunion"] EXCLUDED = ["Réunion", "Svalbard", "Mayotte", "Guadeloupe", "French_Guiana"]
class FlagsSpider(scrapy.Spider): class FlagsSpider(scrapy.Spider):
@@ -79,9 +80,11 @@ class FlagsSpider(scrapy.Spider):
def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str): def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str):
flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get() flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
flags_item = FlagsItem() flags_item_loader = FlagsItemLoader(item=FlagsItem())
flags_item["country_name"] = country_name flags_item_loader.add_value("country_name", country_name)
flags_item["flag_description_html"] = flag_description_html flags_item_loader.add_value("flag_description", flag_description_html)
flags_item = flags_item_loader.load_item()
flags_item["file_urls"] = [f"https:{flag_image_result}"] flags_item["file_urls"] = [f"https:{flag_image_result}"]
yield flags_item yield flags_item

View File

@@ -31,6 +31,10 @@ Exporting JSON:
Setting exports per spider: Setting exports per spider:
<https://stackoverflow.com/a/53322959> <https://stackoverflow.com/a/53322959>
Processing using item loaders + pipelines:
<https://thepythonscrapyplaybook.com/scrapy-beginners-guide-cleaning-data/#pre-processing-data-with-scrapy-item-loaders>
<https://docs.scrapy.org/en/latest/topics/loaders.html>
### new project ### new project
``` ```