chore: update flag spider & add item loader for processing
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
from itemloaders.processors import TakeFirst, MapCompose
|
||||
from scrapy.loader import ItemLoader
|
||||
from w3lib.html import remove_tags
|
||||
|
||||
class FlagsItemLoader(ItemLoader):
    """Item loader that cleans the values collected for a flags item.

    Per-field processors follow Scrapy's naming convention: a class
    attribute named ``<field>_in`` is the input processor for ``<field>``
    (and ``<field>_out`` the output processor). An attribute without one of
    these suffixes is not treated as a processor by the loader.
    """

    # Values are accumulated per field as a list; output only the first one.
    default_output_processor = TakeFirst()

    # country_name - convert "_" to " "
    country_name_in = MapCompose(lambda country: country.replace("_", " "))

    # flag_description - remove html tags
    # The "_in" suffix is required: without it the processor is never
    # applied and the raw HTML would be stored unchanged.
    flag_description_in = MapCompose(remove_tags)
|
||||
@@ -33,7 +33,7 @@ class CapitalsItem(scrapy.Item):
|
||||
|
||||
class FlagsItem(scrapy.Item):
|
||||
country_name = scrapy.Field()
|
||||
flag_description_html = scrapy.Field()
|
||||
flag_description = scrapy.Field()
|
||||
|
||||
file_urls = scrapy.Field()
|
||||
files = scrapy.Field()
|
||||
|
||||
@@ -33,12 +33,7 @@ class AnthemDownloadFilesPipeline(FilesPipeline):
|
||||
flag_filename = re.search(r"([^\/]*)$", request.url)
|
||||
|
||||
if isinstance(flag_filename, re.Match):
|
||||
if (
|
||||
(filename := flag_filename[1]).endswith("ogg")
|
||||
or filename.endswith("oga")
|
||||
or filename.endswith("mp3")
|
||||
or filename.endswith("wav")
|
||||
):
|
||||
if (filename := flag_filename[1]).endswith(("ogg", "oga", "mp3", "wav")):
|
||||
return f"files/anthems/{filename}"
|
||||
|
||||
|
||||
@@ -47,5 +42,5 @@ class FlagDownloadFilesPipeline(FilesPipeline):
|
||||
flag_filename = re.search(r"([^\/]*)$", request.url)
|
||||
|
||||
if isinstance(flag_filename, re.Match):
|
||||
if (filename := flag_filename[1]).endswith(".svg"):
|
||||
if (filename := flag_filename[1]).endswith(("svg", "png")):
|
||||
return f"files/flags/{filename}"
|
||||
|
||||
@@ -3,9 +3,10 @@ import pathlib
|
||||
import scrapy
|
||||
from scrapy.http import TextResponse
|
||||
from wikipedia_country_scraper.items import FlagsItem
|
||||
from wikipedia_country_scraper.itemloaders import FlagsItemLoader
|
||||
|
||||
|
||||
EXCLUDED = ["Réunion"]
|
||||
EXCLUDED = ["Réunion", "Svalbard", "Mayotte", "Guadeloupe", "French_Guiana"]
|
||||
|
||||
|
||||
class FlagsSpider(scrapy.Spider):
|
||||
@@ -79,9 +80,11 @@ class FlagsSpider(scrapy.Spider):
|
||||
def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str):
|
||||
flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
|
||||
|
||||
flags_item = FlagsItem()
|
||||
flags_item["country_name"] = country_name
|
||||
flags_item["flag_description_html"] = flag_description_html
|
||||
flags_item_loader = FlagsItemLoader(item=FlagsItem())
|
||||
flags_item_loader.add_value("country_name", country_name)
|
||||
flags_item_loader.add_value("flag_description", flag_description_html)
|
||||
|
||||
flags_item = flags_item_loader.load_item()
|
||||
flags_item["file_urls"] = [f"https:{flag_image_result}"]
|
||||
|
||||
yield flags_item
|
||||
|
||||
@@ -31,6 +31,10 @@ Exporting JSON:
|
||||
Setting exports per spider:
|
||||
<https://stackoverflow.com/a/53322959>
|
||||
|
||||
Processing using item loaders + pipelines:
|
||||
<https://thepythonscrapyplaybook.com/scrapy-beginners-guide-cleaning-data/#pre-processing-data-with-scrapy-item-loaders>
|
||||
<https://docs.scrapy.org/en/latest/topics/loaders.html>
|
||||
|
||||
### new project
|
||||
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user