chore: add anthems spider

01_scrapy/wikipedia_country_scraper/download_anthems.sh (new executable file)
@@ -0,0 +1,3 @@
+#!/bin/bash
+source .venv/bin/activate
+scrapy crawl anthems

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
@@ -15,3 +15,12 @@ class WikipediaCountryScraperItem(scrapy.Item):
     file_urls = scrapy.Field()
     files = scrapy.Field()
+
+
+class AnthemsItem(scrapy.Item):
+    country_name = scrapy.Field()
+    native_anthem_title = scrapy.Field()
+    english_title = scrapy.Field()
+
+    file_urls = scrapy.Field()
+    files = scrapy.Field()
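Aside, not part of the diff: `AnthemsItem` reuses the stock `FilesPipeline` field convention, where the pipeline reads download URLs from `item["file_urls"]` and records download results in `item["files"]`. A minimal sketch of a populated item (the URL is illustrative, not taken from the crawl):

```python
from wikipedia_country_scraper.items import AnthemsItem

item = AnthemsItem()
item["country_name"] = "Albania"
item["file_urls"] = ["https://upload.wikimedia.org/wikipedia/commons/8/83/Hymni_i_Flamurit.oga"]
# After FilesPipeline runs, item["files"] holds one dict per download,
# with "url", "path", "checksum" and "status" keys.
```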

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
@@ -26,3 +26,12 @@ class WikipediaCountryScraperFilesPipeline(FilesPipeline):
             return f"files/flags/{filename}"
         elif filename.endswith(".ogg") or filename.endswith(".oga"):
             return f"files/anthems/{filename}"
+
+
+class AnthemDownloadFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        flag_filename = re.search(r"([^\/]*)$", request.url)
+
+        if isinstance(flag_filename, re.Match):
+            if (filename := flag_filename[1]).endswith(".ogg") or filename.endswith(".oga"):
+                return f"files/anthems/{filename}"
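Not part of the diff: a standalone sketch of the `file_path` logic above, checking the filename regex and the suffix test against an illustrative URL:

```python
import re


def anthem_path(url: str) -> str | None:
    # Mirrors AnthemDownloadFilesPipeline.file_path above.
    match = re.search(r"([^\/]*)$", url)
    if isinstance(match, re.Match):
        if (filename := match[1]).endswith(".ogg") or filename.endswith(".oga"):
            return f"files/anthems/{filename}"
    return None


print(anthem_path("https://upload.wikimedia.org/wikipedia/commons/8/83/Anthem.ogg"))
# files/anthems/Anthem.ogg
print(anthem_path("https://example.org/page.html"))
# None -- FilesPipeline expects a path for every request, so non-audio
# URLs would need a fallback branch in the real pipeline.
```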

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/settings.py
@@ -65,10 +65,10 @@ DOWNLOADER_MIDDLEWARES = {

 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
-    # "scrapy.pipelines.files.FilesPipeline": 1
-}
+# ITEM_PIPELINES = {
+#     "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
+#     # "scrapy.pipelines.files.FilesPipeline": 1
+# }
 FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy" / "raw_country_data")

 # Enable and configure the AutoThrottle extension (disabled by default)
@@ -92,10 +92,10 @@ FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

-FEEDS = {
-    pathlib.Path(__file__).resolve().parents[3]
-    / "data"
-    / "scrapy"
-    / "raw_country_data"
-    / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
-}
+# FEEDS = {
+#     pathlib.Path(__file__).resolve().parents[3]
+#     / "data"
+#     / "scrapy"
+#     / "raw_country_data"
+#     / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+# }

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/anthems.py (new file)
@@ -0,0 +1,65 @@
+import pathlib
+import re
+
+import scrapy
+from scrapy.http import TextResponse
+
+from wikipedia_country_scraper.items import AnthemsItem
+
+
+class AnthemsSpider(scrapy.Spider):
+    name = "anthems"
+    start_urls = ["https://en.wikipedia.org/wiki/List_of_national_anthems"]
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.AnthemDownloadFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "anthems.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
+
+    def parse(self, response: TextResponse):
+        country_names = []
+        native_anthem_titles = []
+        english_titles = []
+        anthem_urls = []
+
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/th[1]")
+        for link in _country_names:
+            if (country_name := link.xpath("a/text()").get()) is not None:
+                country_names.append(country_name)
+
+        _native_anthem_titles = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/td[1]")
+        for link in _native_anthem_titles:
+            titles = link.xpath("a/text()").getall()
+            native_anthem_title = titles[0] if len(titles) == 1 else "\n".join(titles)
+            native_anthem_titles.append(native_anthem_title)
+
+        for link in _native_anthem_titles:
+            if (english_title := link.xpath("small/text()").get()) is not None:
+                english_titles.append(re.search(r"(?:[\W]*)(?P<title>[^\"]*)", english_title)["title"])
+            else:
+                english_titles.append(None)
+
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody")
+        for index, link in enumerate(_country_names):
+            if index == 0:
+                recognised_countries = link.xpath("tr/td[5]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in recognised_countries)
+            elif index == 1:
+                partially_recognised_countries = link.xpath("tr/td[6]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in partially_recognised_countries)
+
+        for country_name, native_anthem_title, english_title, anthem_url in zip(
+            country_names, native_anthem_titles, english_titles, anthem_urls
+        ):
+            anthem_item = AnthemsItem()
+            anthem_item["country_name"] = country_name
+            anthem_item["native_anthem_title"] = native_anthem_title
+            anthem_item["english_title"] = english_title
+            anthem_item["file_urls"] = [f"https://en.wikipedia.org{anthem_url}" if anthem_url is not None else None]
+
+            yield anthem_item
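One caveat about the `zip()` at the end of `parse`, shown standalone below (not part of the diff): `zip()` truncates to the shortest input, so a length mismatch between the four parallel lists would silently drop anthems. On Python 3.10+, `strict=True` raises instead:

```python
country_names = ["Afghanistan", "Albania"]
anthem_urls = ["/wiki/File:A.ogg"]  # one URL short, e.g. a row without a link

for name, url in zip(country_names, anthem_urls):
    print(name, url)  # only Afghanistan is emitted; Albania is dropped

try:
    list(zip(country_names, anthem_urls, strict=True))
except ValueError as err:
    print(err)  # zip() argument 2 is shorter than argument 1
```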

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import pathlib
 import re

 import scrapy
@@ -10,6 +11,16 @@ from wikipedia_country_scraper.items import WikipediaCountryScraperItem

 class CountrydownloaderSpider(scrapy.Spider):
     name = "CountrydownloaderSpider"
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }

     def start_requests(self):
         return [
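Not part of the diff: the spiders use `parents[4]` where `settings.py` uses `parents[3]` because spider modules sit one directory deeper, so both expressions resolve to the same repository root. A sketch assuming the standard `scrapy startproject` layout:

```python
import pathlib

# Assumed standard Scrapy project layout:
settings_py = pathlib.Path(
    "repo/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/settings.py"
)
spider_py = pathlib.Path(
    "repo/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py"
)

assert settings_py.parents[3] == spider_py.parents[4] == pathlib.Path("repo")
```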

README.md
@@ -23,9 +23,13 @@ Using selectors:

 Download files/images:
 <https://docs.scrapy.org/en/latest/topics/media-pipeline.html>
+Setting pipelines per spider:
+<https://stackoverflow.com/a/34647090>

 Exporting JSON:
 <https://docs.scrapy.org/en/latest/topics/feed-exports.html#std-setting-FEEDS>
+Setting exports per spider:
+<https://stackoverflow.com/a/53322959>

 ### new project

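A minimal sketch of the per-spider configuration pattern those two answers describe (names here are illustrative, not from this repo): `custom_settings` is a class attribute that overrides the project-wide `settings.py` for one spider only.

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    # Overrides ITEM_PIPELINES and FEEDS from settings.py for this spider only.
    custom_settings = {
        "ITEM_PIPELINES": {"myproject.pipelines.ExamplePipeline": 100},
        "FEEDS": {"example.json": {"format": "json", "encoding": "utf8"}},
    }
```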
File diff suppressed because one or more lines are too long