chore: remove anthem from scraper
@@ -9,11 +9,8 @@ import scrapy
 class WikipediaCountryScraperItem(scrapy.Item):
     country_url = scrapy.Field()
     short_country_name = scrapy.Field()
-    country = scrapy.Field()
-    flag_description = scrapy.Field()
-    anthem = scrapy.Field()
-    anthem_url = scrapy.Field()
-    anthem_file_url = scrapy.Field()
+    country_html = scrapy.Field()
+    flag_html = scrapy.Field()
 
     file_urls = scrapy.Field()
     files = scrapy.Field()
@@ -19,13 +19,15 @@ class CountrydownloaderSpider(scrapy.Spider):
         ]
 
     def extract_country_urls(self, response: TextResponse):
+        """Extract urls of all countries from https://en.wikipedia.org/wiki/List_of_sovereign_states."""
+
         country_urls_xpath = response.xpath(
             "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
         ).getall()
 
         for url in country_urls_xpath:
-            # for url in country_urls_xpath[:3]:
             regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
+
             yield scrapy.Request(
                 url=f"https://en.wikipedia.org{url}",
                 callback=self.extract_country_information,
@@ -40,6 +42,8 @@ class CountrydownloaderSpider(scrapy.Spider):
             )
 
     def extract_country_information(self, response: TextResponse, country_item: dict):
+        """Extract html of country sidebar on each country page."""
+
         country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()
 
         flag_image_url = response.xpath(
@@ -49,13 +53,9 @@ class CountrydownloaderSpider(scrapy.Spider):
             "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
         ).get()
 
-        anthem_page_url = response.xpath(
-            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
-        ).get()
-
         country_item = {
             **country_item,
-            "country": country_information_xpath,
+            "country_html": country_information_xpath,
         }
 
         yield scrapy.Request(
@@ -65,16 +65,18 @@ class CountrydownloaderSpider(scrapy.Spider):
                 "country_item": country_item,
                 "urls": {
                     "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
-                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
                 },
             },
         )
 
     def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
+        """Extract the html of the first paragraph on each country flag page."""
+
         flag_description_xpath = response.xpath(
             "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
         ).get()
-        country_item = {**country_item, "flag_description": flag_description_xpath}
+
+        country_item = {**country_item, "flag_html": flag_description_xpath}
 
         yield scrapy.Request(
             url=urls["flag_image_url"],
@@ -86,37 +88,18 @@ class CountrydownloaderSpider(scrapy.Spider):
         )
 
     def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
+        """Extract the image URL for each country flag."""
+
         flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
+
         country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
 
-        yield scrapy.Request(
-            url=urls["anthem_page_url"],
-            callback=self.extract_anthem_file,
-            cb_kwargs={
-                "country_item": country_item,
-                "urls": urls,
-            },
-        )
-
-    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
-        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
-        _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
-
-        anthem_file_url = next(
-            (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
-        )
-
+        # yield the country item containing scraped data
         country_scrapy_item = WikipediaCountryScraperItem()
         country_scrapy_item["country_url"] = country_item["country_url"]
         country_scrapy_item["short_country_name"] = country_item["short_country_name"]
-        country_scrapy_item["country"] = country_item["country"]
-        country_scrapy_item["flag_description"] = country_item["flag_description"]
-        country_scrapy_item["anthem"] = anthem_text
-        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
-        country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
-        country_scrapy_item["file_urls"] = [
-            country_item["flag_image_url"],
-            f"https://en.wikipedia.org{anthem_file_url}",
-        ]
+        country_scrapy_item["country_html"] = country_item["country_html"]
+        country_scrapy_item["flag_html"] = country_item["flag_html"]
+        country_scrapy_item["file_urls"] = [country_item["flag_image_url"]]
 
         yield country_scrapy_item
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import re
+
+import scrapy
+from scrapy.http import TextResponse
+
+from wikipedia_country_scraper.items import WikipediaCountryScraperItem
+
+
+class CountrydownloaderSpider(scrapy.Spider):
+    name = "CountrydownloaderSpider"
+
+    def start_requests(self):
+        return [
+            scrapy.Request(
+                url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls
+            )
+        ]
+
+    def extract_country_urls(self, response: TextResponse):
+        country_urls_xpath = response.xpath(
+            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
+        ).getall()
+
+        for url in country_urls_xpath:
+            # for url in country_urls_xpath[:3]:
+            regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
+            yield scrapy.Request(
+                url=f"https://en.wikipedia.org{url}",
+                callback=self.extract_country_information,
+                cb_kwargs={
+                    "country_item": {
+                        "country_url": f"https://en.wikipedia.org{url}",
+                        "short_country_name": regex_match["short_country_name"]
+                        if isinstance(regex_match, re.Match)
+                        else None,
+                    }
+                },
+            )
+
+    def extract_country_information(self, response: TextResponse, country_item: dict):
+        country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()
+
+        flag_image_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
+        ).get()
+        flag_description_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
+        ).get()
+
+        anthem_page_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
+        ).get()
+
+        country_item = {
+            **country_item,
+            "country": country_information_xpath,
+        }
+
+        yield scrapy.Request(
+            url=f"https://en.wikipedia.org{flag_description_url}",
+            callback=self.extract_flag_description,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": {
+                    "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
+                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
+                },
+            },
+        )
+
+    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
+        flag_description_xpath = response.xpath(
+            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
+        ).get()
+        country_item = {**country_item, "flag_description": flag_description_xpath}
+
+        yield scrapy.Request(
+            url=urls["flag_image_url"],
+            callback=self.extract_flag_images,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": urls,
+            },
+        )
+
+    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
+        flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
+        country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
+
+        yield scrapy.Request(
+            url=urls["anthem_page_url"],
+            callback=self.extract_anthem_file,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": urls,
+            },
+        )
+
+    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
+        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
+        _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
+
+        anthem_file_url = next(
+            (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
+        )
+
+        country_scrapy_item = WikipediaCountryScraperItem()
+        country_scrapy_item["country_url"] = country_item["country_url"]
+        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
+        country_scrapy_item["country"] = country_item["country"]
+        country_scrapy_item["flag_description"] = country_item["flag_description"]
+        country_scrapy_item["anthem"] = anthem_text
+        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
+        country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
+        country_scrapy_item["file_urls"] = [
+            country_item["flag_image_url"],
+            f"https://en.wikipedia.org{anthem_file_url}",
+        ]
+
+        yield country_scrapy_item
@@ -79,15 +79,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 36,
    "id": "d03be94e-8642-4916-8a43-1711e0c21b36",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2022-06-24T01:09:19.590298Z",
-     "iopub.status.busy": "2022-06-24T01:09:19.589666Z",
-     "iopub.status.idle": "2022-06-24T01:09:19.676856Z",
-     "shell.execute_reply": "2022-06-24T01:09:19.674877Z",
-     "shell.execute_reply.started": "2022-06-24T01:09:19.590267Z"
+     "iopub.execute_input": "2022-06-24T01:28:21.599365Z",
+     "iopub.status.busy": "2022-06-24T01:28:21.598512Z",
+     "iopub.status.idle": "2022-06-24T01:28:21.717735Z",
+     "shell.execute_reply": "2022-06-24T01:28:21.715894Z",
+     "shell.execute_reply.started": "2022-06-24T01:28:21.599295Z"
     },
     "tags": []
    },
@@ -99,7 +99,7 @@
    "traceback": [
     "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
     "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
-    "Input \u001b[0;32mIn [35]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m countries_file \u001b[38;5;241m=\u001b[39m data_directory \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcountries.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m countries \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcountries_file\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
+    "Input \u001b[0;32mIn [36]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m countries_file \u001b[38;5;241m=\u001b[39m data_directory \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcountries.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m countries \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcountries_file\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
     "File \u001b[0;32m~/.pyenv/versions/3.8.12/lib/python3.8/json/__init__.py:357\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m kw[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 355\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 356\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 357\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONDecoder\n",
     "File \u001b[0;32m~/.pyenv/versions/3.8.12/lib/python3.8/json/decoder.py:340\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m end \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(s):\n\u001b[0;32m--> 340\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExtra data\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, end)\n\u001b[1;32m 341\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
     "\u001b[0;31mJSONDecodeError\u001b[0m: Extra data: line 83 column 2 (char 2294639)"
@@ -114,19 +114,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 37,
    "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2022-06-24T00:48:48.927613Z",
-     "iopub.status.busy": "2022-06-24T00:48:48.926883Z",
-     "iopub.status.idle": "2022-06-24T00:48:49.010610Z",
-     "shell.execute_reply": "2022-06-24T00:48:49.008078Z",
-     "shell.execute_reply.started": "2022-06-24T00:48:48.927549Z"
+     "iopub.execute_input": "2022-06-24T01:28:23.587271Z",
+     "iopub.status.busy": "2022-06-24T01:28:23.586601Z",
+     "iopub.status.idle": "2022-06-24T01:28:23.803584Z",
+     "shell.execute_reply": "2022-06-24T01:28:23.801073Z",
+     "shell.execute_reply.started": "2022-06-24T01:28:23.587242Z"
     },
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Trailing data",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [37]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcountries_file\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/util/_decorators.py:207\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 205\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 206\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 207\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/util/_decorators.py:311\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[1;32m 306\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 307\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39marguments),\n\u001b[1;32m 308\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m 309\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mstacklevel,\n\u001b[1;32m 310\u001b[0m )\n\u001b[0;32m--> 311\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/io/json/_json.py:612\u001b[0m, in \u001b[0;36mread_json\u001b[0;34m(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, encoding_errors, lines, chunksize, compression, nrows, storage_options)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m json_reader\n\u001b[1;32m 611\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m json_reader:\n\u001b[0;32m--> 612\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mjson_reader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/io/json/_json.py:746\u001b[0m, in \u001b[0;36mJsonReader.read\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 744\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_object_parser(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_combine_lines(data_lines))\n\u001b[1;32m 745\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 746\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_object_parser\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n\u001b[1;32m 748\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/io/json/_json.py:768\u001b[0m, in \u001b[0;36mJsonReader._get_object_parser\u001b[0;34m(self, json)\u001b[0m\n\u001b[1;32m 766\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 767\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m typ \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframe\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 768\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mFrameParser\u001b[49m\u001b[43m(\u001b[49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 770\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m typ \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mseries\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 771\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, \u001b[38;5;28mbool\u001b[39m):\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/io/json/_json.py:880\u001b[0m, in \u001b[0;36mParser.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 878\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parse_numpy()\n\u001b[1;32m 879\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 880\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parse_no_numpy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 882\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 883\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
+      "File \u001b[0;32m~/git-repos/geography-anki/playground/downloaded_data_inspection/.venv/lib/python3.8/site-packages/pandas/io/json/_json.py:1133\u001b[0m, in \u001b[0;36mFrameParser._parse_no_numpy\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1129\u001b[0m orient \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morient\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m orient \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumns\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 1132\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj \u001b[38;5;241m=\u001b[39m DataFrame(\n\u001b[0;32m-> 1133\u001b[0m \u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprecise_float\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprecise_float\u001b[49m\u001b[43m)\u001b[49m, dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1134\u001b[0m )\n\u001b[1;32m 1135\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m orient \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplit\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 1136\u001b[0m decoded \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1137\u001b[0m \u001b[38;5;28mstr\u001b[39m(k): v\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m loads(json, precise_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprecise_float)\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m 1139\u001b[0m }\n",
+      "\u001b[0;31mValueError\u001b[0m: Trailing data"
+     ]
+    }
+   ],
    "source": [
     "df = pd.read_json(countries_file)"
    ]