chore: add initial scrapy code
This commit is contained in:
@@ -0,0 +1,28 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
import logging
import re

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline
|
||||
|
||||
|
||||
class WikipediaCountryScraperPipeline:
    """Pass-through item pipeline: every item is handed on unmodified."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received.

        Args:
            item: The scraped item handed to the pipeline.
            spider: The spider that produced the item (not used here).

        Returns:
            The same ``item`` object, untouched.
        """
        return item
|
||||
|
||||
|
||||
class WikipediaCountryScraperFilesPipeline(FilesPipeline):
    """Files pipeline that names downloads after the file name in the URL.

    Downloads are stored under ``files/`` using a name taken from the request
    URL instead of the base pipeline's default naming.
    """

    # Text that follows "File:" in the URL, up to (not including) any "$".
    _ANTHEM_NAME_RE = re.compile(r"(?P<filename>(?<=File\:)[^\$]*)")
    # Last path segment of the URL. This pattern can match the empty string
    # (URL ending in "/"), so a successful match alone does not mean a usable
    # file name was found.
    _LAST_SEGMENT_RE = re.compile(r"([^\/]*)$")

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to the files store) for a request.

        Resolution order:

        1. If the URL contains ``File:`` and the text after it ends with
           ``.mp3`` (presumably the anthem audio links), use that text.
        2. Otherwise use the last path segment of the URL (presumably the
           flag image links), provided it is not empty.
        3. Otherwise defer to the base class implementation.

        The specific ``File:`` check must come first: the last-segment
        pattern matches every URL, so testing it first would make the
        ``.mp3`` branch unreachable.

        Args:
            request: The download request; only ``request.url`` is read here.
            response: Forwarded to the base class in the fallback case.
            info: Forwarded to the base class in the fallback case.
            item: Forwarded to the base class in the fallback case.

        Returns:
            A path of the form ``files/<name>``, or whatever the base class
            returns when no name can be derived from the URL.
        """
        logging.getLogger(__name__).debug("request URL: %s", request.url)

        anthem_match = self._ANTHEM_NAME_RE.search(request.url)
        if anthem_match is not None and anthem_match["filename"].endswith(".mp3"):
            return f"files/{anthem_match['filename']}"

        segment_match = self._LAST_SEGMENT_RE.search(request.url)
        if segment_match is not None and segment_match[1]:
            return f"files/{segment_match[1]}"

        # No usable name in the URL (e.g. it ends with "/"): use the default.
        return super().file_path(request, response=response, info=info, item=item)
|
||||
Reference in New Issue
Block a user