Files
geography-anki/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py

53 lines
2.1 KiB
Python

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import logging
import re
from urllib.parse import unquote

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline
class WikipediaCountryScraperPipeline:
    """Pass-through item pipeline: every item is handed on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as it was received.

        Args:
            item: The scraped item passed into the pipeline.
            spider: The spider that produced the item (not used here).

        Returns:
            The very same ``item`` object, unmodified.
        """
        return item
class WikipediaCountryScraperFilesPipeline(FilesPipeline):
    """Files pipeline that sorts downloads into flag and anthem folders.

    The target folder is chosen from the extension of the last path
    segment of the request URL: ``.svg`` files go to ``files/flags/``,
    ``.ogg`` / ``.oga`` files go to ``files/anthems/``.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file behind ``request``.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Forwarded to the base class on fallback.
            info: Forwarded to the base class on fallback.
            item: Forwarded to the base class on fallback.

        Returns:
            ``files/flags/<name>`` for SVG files, ``files/anthems/<name>``
            for Ogg audio, otherwise whatever the base ``FilesPipeline``
            would have chosen.
        """
        # Debug-level logging instead of print() so the output obeys the
        # crawler's log configuration.
        logging.getLogger(__name__).debug("request URLs: %s", request.url)

        # Everything after the last "/" (the whole URL if there is none).
        filename = request.url.rpartition("/")[2]

        if filename.endswith(".svg"):
            return f"files/flags/{filename}"
        if filename.endswith((".ogg", ".oga")):
            return f"files/anthems/{filename}"

        # Never return None: the file store needs a real path string, so
        # unknown extensions use the base class's default naming.
        return super().file_path(
            request, response=response, info=info, item=item
        )
class AnthemDownloadFilesPipeline(FilesPipeline):
    """Files pipeline that stores anthem audio under ``files/anthems/``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file behind ``request``.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Forwarded to the base class on fallback.
            info: Forwarded to the base class on fallback.
            item: Forwarded to the base class on fallback.

        Returns:
            ``files/anthems/<name>`` when the last URL segment ends in
            ``ogg``, ``oga``, ``mp3`` or ``wav``; otherwise the path the
            base ``FilesPipeline`` would have chosen.
        """
        # Everything after the last "/" (the whole URL if there is none).
        filename = request.url.rpartition("/")[2]

        if filename.endswith(("ogg", "oga", "mp3", "wav")):
            return f"files/anthems/{filename}"

        # Never return None: the file store needs a real path string, so
        # non-audio URLs use the base class's default naming.
        return super().file_path(
            request, response=response, info=info, item=item
        )
class FlagDownloadFilesPipeline(FilesPipeline):
    """Files pipeline that stores flag images under ``files/flags/``.

    Percent-encoded non-ASCII characters in the file name (for example
    ``%C3%A7`` for "ç") are decoded so the stored name is readable.
    """

    # One or more consecutive percent-escapes whose byte value is >= 0x80,
    # i.e. the bytes that make up UTF-8 encoded non-ASCII characters.
    _NON_ASCII_ESCAPES = re.compile(r"(?:%[89A-Fa-f][0-9A-Fa-f])+")

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file behind ``request``.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Forwarded to the base class on fallback.
            info: Forwarded to the base class on fallback.
            item: Forwarded to the base class on fallback.

        Returns:
            ``files/flags/<name>`` when the last URL segment ends in
            ``svg`` or ``png`` (with non-ASCII escapes decoded); otherwise
            the path the base ``FilesPipeline`` would have chosen.
        """
        # Everything after the last "/" (the whole URL if there is none).
        filename = request.url.rpartition("/")[2]

        if filename.endswith(("svg", "png")):
            # Only non-ASCII escapes are decoded. ASCII escapes such as
            # %27, %28 and %29 stay encoded, as they did before (their
            # replacements were commented out), which also keeps an
            # escaped "/" (%2F) from turning into a path separator.
            filename = self._NON_ASCII_ESCAPES.sub(
                self._decode_escapes, filename
            )
            return f"files/flags/{filename}"

        # Never return None: the file store needs a real path string, so
        # non-image URLs use the base class's default naming.
        return super().file_path(
            request, response=response, info=info, item=item
        )

    @staticmethod
    def _decode_escapes(match):
        """Decode one run of percent-escapes as UTF-8, or leave it as-is."""
        escaped = match.group(0)
        try:
            return unquote(escaped, errors="strict")
        except UnicodeDecodeError:
            # Not valid UTF-8: keep the original text rather than
            # inserting replacement characters into the file name.
            return escaped