chore: add initial scrapy code

This commit is contained in:
2022-06-22 20:39:31 +01:00
parent 5d0c7b19fa
commit e49fa7a346
11 changed files with 1612 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import logging
import re

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline
class WikipediaCountryScraperPipeline:
    """Item pipeline that passes every item through unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` untouched.

        Args:
            item: The scraped item handed to this pipeline stage.
            spider: The spider that produced the item (unused).

        Returns:
            The very same ``item`` object that was passed in.
        """
        return item
class WikipediaCountryScraperFilesPipeline(FilesPipeline):
    """Files pipeline that names downloads after the file name in the URL."""

    # Text following "File:" in the URL, up to the first "$" or the end.
    _WIKI_FILE_RE = re.compile(r"(?<=File:)(?P<filename>[^$]*)")
    # Non-empty last path segment of the URL.
    _LAST_SEGMENT_RE = re.compile(r"([^/]+)$")

    _logger = logging.getLogger(__name__)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file downloaded by ``request``.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Forwarded to the parent implementation on fallback.
            info: Forwarded to the parent implementation on fallback.
            item: Forwarded to the parent implementation on fallback.

        Returns:
            ``"files/<name>"`` where ``<name>`` is the text after ``File:``
            when the URL contains ``File:`` and that text ends in ``.mp3``,
            otherwise the last non-empty path segment of the URL. If neither
            can be derived, the parent class's ``file_path`` result is
            returned.
        """
        url = request.url
        self._logger.debug("request URL: %s", url)

        # The "File:" check must come first: the last-segment pattern also
        # matches such URLs and would keep the "File:" prefix in the name.
        wiki_file = self._WIKI_FILE_RE.search(url)
        if wiki_file is not None and wiki_file["filename"].endswith(".mp3"):
            return f"files/{wiki_file['filename']}"

        last_segment = self._LAST_SEGMENT_RE.search(url)
        if last_segment is not None:
            return f"files/{last_segment[1]}"

        # No usable name (e.g. the URL ends with "/"): defer to the parent
        # class rather than returning None or a bare "files/" path.
        return super().file_path(request, response=response, info=info, item=item)