chore: change anthem download to .ogg instead of .mp3

This commit is contained in:
2022-06-22 23:19:49 +01:00
parent c781e337b8
commit 5581762c39
6 changed files with 5500 additions and 3 deletions

View File

@@ -1,2 +1,3 @@
#!/bin/bash
# Launch the CountrydownloaderSpider crawl from the project's virtualenv.
#
# Must be run from the directory that contains .venv, because the
# activation path below is relative to the current working directory.

# Abort if the virtualenv cannot be activated; otherwise the crawl would
# silently run with whatever `scrapy` (if any) happens to be on PATH.
source .venv/bin/activate || {
    echo "error: could not activate .venv (run this from the project root)" >&2
    exit 1
}

scrapy crawl CountrydownloaderSpider

View File

@@ -24,5 +24,5 @@ class WikipediaCountryScraperFilesPipeline(FilesPipeline):
if isinstance(flag_filename, re.Match):
if (filename := flag_filename[1]).endswith(".svg"):
return f"files/flags/{filename}"
elif filename.endswith(".ogg.mp3"):
elif filename.endswith(".ogg"):
return f"files/anthems/{filename}"

View File

@@ -50,7 +50,7 @@ class CountrydownloaderSpider(scrapy.Spider):
).get()
anthem_file_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//source[@data-title='MP3']/@src"
"//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//source[contains(@type, 'audio/ogg')]/@src"
).get()
anthem_item = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]"
@@ -94,7 +94,6 @@ class CountrydownloaderSpider(scrapy.Spider):
country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
country_scrapy_item = WikipediaCountryScraperItem()
print(f"ANTHEM: {country_item['anthem']}")
country_scrapy_item["country_url"] = country_item["country_url"]
country_scrapy_item["short_country_name"] = country_item["short_country_name"]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
# Poetry project definition for the "downloaded_data_inspection" environment.
# NOTE(review): presumably used to explore the scraped output in notebooks,
# judging by the name and the Jupyter/pandas dependencies; confirm.
[tool.poetry]
name = "downloaded_data_inspection"
version = "0.1.0"
description = ""
authors = ["Daniel Tomlinson <dtomlinson@panaetius.co.uk>"]
# Runtime dependencies. Caret constraints allow compatible upgrades that do
# not change the left-most non-zero version component.
[tool.poetry.dependencies]
python = "^3.8"
# Jupyter Notebook server plus pandas.
notebook = "^6.4.12"
pandas = "^1.4.2"
# Jupyter add-on packages (themes, contributed extensions, resource usage).
jupyterthemes = "^0.20.0"
jupyter-contrib-nbextensions = "^0.5.1"
jupyter-resource-usage = "^0.6.1"
# No development-only dependencies are declared.
[tool.poetry.dev-dependencies]
# Build backend declaration: the project is built with poetry-core.
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"