341 lines
13 KiB
Plaintext
341 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "d8185790-0793-4881-99e8-6730f95a8006",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-23T23:01:31.065905Z",
|
|
"iopub.status.busy": "2022-06-23T23:01:31.065496Z",
|
|
"iopub.status.idle": "2022-06-23T23:01:31.544067Z",
|
|
"shell.execute_reply": "2022-06-23T23:01:31.542907Z",
|
|
"shell.execute_reply.started": "2022-06-23T23:01:31.065831Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"import pathlib\n",
|
|
"\n",
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-23T23:01:31.792796Z",
|
|
"iopub.status.busy": "2022-06-23T23:01:31.792121Z",
|
|
"iopub.status.idle": "2022-06-23T23:01:31.808265Z",
|
|
"shell.execute_reply": "2022-06-23T23:01:31.807317Z",
|
|
"shell.execute_reply.started": "2022-06-23T23:01:31.792751Z"
|
|
},
|
|
"tags": []
|
|
},
|
"outputs": [],
"source": [
"# Notebook-wide pandas display options; None removes the row limit so whole columns print.\n",
"pd_options = {\n",
"    \"display.max_rows\": None,\n",
"}\n",
"\n",
"# Apply the options with a plain loop: a list comprehension used only for its\n",
"# side effects builds a throwaway list and echoes it as the cell result.\n",
"for option, value in pd_options.items():\n",
"    pd.set_option(option, value)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "36149580-91d9-431d-99c3-51feee829e79",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T00:47:50.283172Z",
|
|
"iopub.status.busy": "2022-06-24T00:47:50.282750Z",
|
|
"iopub.status.idle": "2022-06-24T00:47:50.301549Z",
|
|
"shell.execute_reply": "2022-06-24T00:47:50.300747Z",
|
|
"shell.execute_reply.started": "2022-06-24T00:47:50.283143Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Raw Scrapy country export folder, resolved from two levels above the working directory.\n",
"data_directory = (\n",
"    pathlib.Path(\".\")\n",
"    .resolve()\n",
"    .parents[1]\n",
"    .joinpath(\"data\", \"scrapy\", \"raw_country_data\")\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"id": "d03be94e-8642-4916-8a43-1711e0c21b36",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T01:09:19.590298Z",
|
|
"iopub.status.busy": "2022-06-24T01:09:19.589666Z",
|
|
"iopub.status.idle": "2022-06-24T01:09:19.676856Z",
|
|
"shell.execute_reply": "2022-06-24T01:09:19.674877Z",
|
|
"shell.execute_reply.started": "2022-06-24T01:09:19.590267Z"
|
|
},
|
|
"tags": []
|
|
},
|
"outputs": [],
"source": [
"def load_json_documents(path):\n",
"    \"\"\"Return the JSON content of ``path``, tolerating concatenated documents.\n",
"\n",
"    ``json.loads`` fails with \"Extra data\" when more text follows the first\n",
"    complete document, so every document in the file is decoded in turn.\n",
"\n",
"    Returns the decoded value unchanged when the file holds a single document;\n",
"    otherwise one list with the items of every document, in file order.\n",
"    Raises ``json.JSONDecodeError`` when any part of the text is not valid JSON.\n",
"    \"\"\"\n",
"    decoder = json.JSONDecoder()\n",
"    remaining = path.read_text(encoding=\"utf-8\").strip()\n",
"    documents = []\n",
"    while True:\n",
"        document, end = decoder.raw_decode(remaining)\n",
"        documents.append(document)\n",
"        # raw_decode does not skip whitespace, so trim it before the next document.\n",
"        remaining = remaining[end:].lstrip()\n",
"        if not remaining:\n",
"            break\n",
"    if len(documents) == 1:\n",
"        return documents[0]\n",
"    merged = []\n",
"    for document in documents:\n",
"        merged.extend(document if isinstance(document, list) else [document])\n",
"    return merged\n",
"\n",
"\n",
"countries_file = data_directory / \"countries.json\"\n",
"# NOTE(review): presumably the export was appended to by more than one run, so the\n",
"# merged list may contain duplicate countries -- confirm before relying on it.\n",
"countries = load_json_documents(countries_file)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T00:48:48.927613Z",
|
|
"iopub.status.busy": "2022-06-24T00:48:48.926883Z",
|
|
"iopub.status.idle": "2022-06-24T00:48:49.010610Z",
|
|
"shell.execute_reply": "2022-06-24T00:48:49.008078Z",
|
|
"shell.execute_reply.started": "2022-06-24T00:48:48.927549Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Load the same export file into a DataFrame for the column checks below.\n",
"df = pd.read_json(path_or_buf=countries_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T00:48:51.018167Z",
|
|
"iopub.status.busy": "2022-06-24T00:48:51.017745Z",
|
|
"iopub.status.idle": "2022-06-24T00:48:51.023756Z",
|
|
"shell.execute_reply": "2022-06-24T00:48:51.022902Z",
|
|
"shell.execute_reply.started": "2022-06-24T00:48:51.018137Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(81,)\n",
|
|
"[False]\n",
|
|
"[False]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"country_url = df[\"country_url\"]\n",
"# One line each: column shape, then the distinct missing-value flags from isnull() and isna().\n",
"print(\n",
"    country_url.shape,\n",
"    country_url.isnull().unique(),\n",
"    country_url.isna().unique(),\n",
"    sep=\"\\n\",\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"id": "48db8f93-659b-45a4-8477-a7cec139bebc",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T00:48:52.316175Z",
|
|
"iopub.status.busy": "2022-06-24T00:48:52.315575Z",
|
|
"iopub.status.idle": "2022-06-24T00:48:52.323965Z",
|
|
"shell.execute_reply": "2022-06-24T00:48:52.323184Z",
|
|
"shell.execute_reply.started": "2022-06-24T00:48:52.316146Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(81,)\n",
|
|
"[False]\n",
|
|
"[False]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"short_country_name = df[\"short_country_name\"]\n",
"# One line each: column shape, then the distinct missing-value flags from isnull() and isna().\n",
"print(\n",
"    short_country_name.shape,\n",
"    short_country_name.isnull().unique(),\n",
"    short_country_name.isna().unique(),\n",
"    sep=\"\\n\",\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T00:47:57.991196Z",
|
|
"iopub.status.busy": "2022-06-24T00:47:57.990582Z",
|
|
"iopub.status.idle": "2022-06-24T00:47:58.001189Z",
|
|
"shell.execute_reply": "2022-06-24T00:47:57.999654Z",
|
|
"shell.execute_reply.started": "2022-06-24T00:47:57.991142Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(81,)\n",
|
|
"[False]\n",
|
|
"[False]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"flag_description = df[\"flag_description\"]\n",
"# One line each: column shape, then the distinct missing-value flags from isnull() and isna().\n",
"print(\n",
"    flag_description.shape,\n",
"    flag_description.isnull().unique(),\n",
"    flag_description.isna().unique(),\n",
"    sep=\"\\n\",\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T00:47:59.412325Z",
|
|
"iopub.status.busy": "2022-06-24T00:47:59.411973Z",
|
|
"iopub.status.idle": "2022-06-24T00:47:59.420681Z",
|
|
"shell.execute_reply": "2022-06-24T00:47:59.419781Z",
|
|
"shell.execute_reply.started": "2022-06-24T00:47:59.412296Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"7\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[[{'url': 'https://upload.wikimedia.org/wikipedia/commons/5/5c/Flag_of_the_Taliban.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_the_Taliban.svg',\n",
|
|
" 'checksum': '153b7b9dc8133d542e744f5ff6102710',\n",
|
|
" 'status': 'downloaded'}],\n",
|
|
" [{'url': 'https://upload.wikimedia.org/wikipedia/commons/0/01/Flag_of_Niue.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_Niue.svg',\n",
|
|
" 'checksum': 'ce971e9afe79c9a63fd706a617b34ce2',\n",
|
|
" 'status': 'downloaded'}],\n",
|
|
" [{'url': 'https://upload.wikimedia.org/wikipedia/commons/4/4e/Flag_of_Uganda.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_Uganda.svg',\n",
|
|
" 'checksum': 'f8bb736e5832232610b5b65dd3c0a121',\n",
|
|
" 'status': 'downloaded'}],\n",
|
|
" [{'url': 'https://upload.wikimedia.org/wikipedia/commons/e/e4/Flag_of_the_Federated_States_of_Micronesia.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_the_Federated_States_of_Micronesia.svg',\n",
|
|
" 'checksum': 'cfc3756759f4002983b49217456fc8e4',\n",
|
|
" 'status': 'downloaded'}],\n",
|
|
" [{'url': 'https://upload.wikimedia.org/wikipedia/commons/4/49/Flag_of_Kenya.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_Kenya.svg',\n",
|
|
" 'checksum': 'aa572e0e7ad47c23e37633f1b370da8d',\n",
|
|
" 'status': 'downloaded'}],\n",
|
|
" [{'url': 'https://upload.wikimedia.org/wikipedia/commons/9/91/Flag_of_Bhutan.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_Bhutan.svg',\n",
|
|
" 'checksum': 'ce4684f240e15637d2c67eb222d63fe5',\n",
|
|
" 'status': 'downloaded'}],\n",
|
|
" [{'url': 'https://upload.wikimedia.org/wikipedia/commons/8/85/Flag_of_Belarus.svg',\n",
|
|
" 'path': 'files/flags/Flag_of_Belarus.svg',\n",
|
|
" 'checksum': '22ec6af94d36453ca6e7c0830000a6c1',\n",
|
|
" 'status': 'downloaded'}]]"
|
|
]
|
|
},
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Keep the rows whose \"files\" value holds exactly one entry.\n",
"# NOTE(review): the name assumes the lone entry is the flag and the anthem is missing -- confirm.\n",
"# Iterate the column directly; wrapping it in a list just to index it back out did nothing.\n",
"no_anthem = [files for files in df[\"files\"] if len(files) == 1]\n",
"print(len(no_anthem))\n",
"no_anthem"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "227b0c76-9e45-4849-849e-36355976cba9",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Show the whole short-name column for a visual check.\n",
"df.loc[:, \"short_country_name\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.12"
|
|
},
|
|
"toc-autonumbering": true,
|
|
"toc-showcode": false
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|