{
"cells": [
{
"cell_type": "code",
"execution_count": 38,
"id": "d8185790-0793-4881-99e8-6730f95a8006",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:31:57.297266Z",
"iopub.status.busy": "2022-06-24T21:31:57.284090Z",
"iopub.status.idle": "2022-06-24T21:31:57.366471Z",
"shell.execute_reply": "2022-06-24T21:31:57.365193Z",
"shell.execute_reply.started": "2022-06-24T21:31:57.293844Z"
},
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"import pathlib\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:31:58.025200Z",
"iopub.status.busy": "2022-06-24T21:31:58.024201Z",
"iopub.status.idle": "2022-06-24T21:31:58.108904Z",
"shell.execute_reply": "2022-06-24T21:31:58.107402Z",
"shell.execute_reply.started": "2022-06-24T21:31:58.025121Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[None]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Notebook-wide pandas display options; None removes the row cap on rendered frames.\n",
"pd_options = {\n",
"    \"display.max_rows\": None,\n",
"}\n",
"\n",
"# Plain loop: set_option is called only for its side effect, so no throwaway\n",
"# list is built and the cell no longer echoes a spurious [None] result.\n",
"for option, value in pd_options.items():\n",
"    pd.set_option(option, value)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "36149580-91d9-431d-99c3-51feee829e79",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:31:58.650508Z",
"iopub.status.busy": "2022-06-24T21:31:58.650001Z",
"iopub.status.idle": "2022-06-24T21:31:58.670264Z",
"shell.execute_reply": "2022-06-24T21:31:58.669296Z",
"shell.execute_reply.started": "2022-06-24T21:31:58.650473Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Two levels above the resolved working directory, then data/scrapy/raw_country_data.\n",
"data_directory = pathlib.Path(\".\").resolve().parents[1].joinpath(\n",
"    \"data\", \"scrapy\", \"raw_country_data\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "d03be94e-8642-4916-8a43-1711e0c21b36",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:52:35.726961Z",
"iopub.status.busy": "2022-06-24T21:52:35.726356Z",
"iopub.status.idle": "2022-06-24T21:52:35.823456Z",
"shell.execute_reply": "2022-06-24T21:52:35.822464Z",
"shell.execute_reply.started": "2022-06-24T21:52:35.726932Z"
},
"tags": []
},
"outputs": [],
"source": [
"countries_file = data_directory / \"countries.json\"\n",
"# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of the\n",
"# platform's locale default, so non-ASCII text loads the same on every machine.\n",
"countries = json.loads(countries_file.read_text(encoding=\"utf-8\"))"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:52:36.453655Z",
"iopub.status.busy": "2022-06-24T21:52:36.452202Z",
"iopub.status.idle": "2022-06-24T21:52:36.555461Z",
"shell.execute_reply": "2022-06-24T21:52:36.554096Z",
"shell.execute_reply.started": "2022-06-24T21:52:36.453559Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Load countries.json into a DataFrame for column-level inspection.\n",
"df = pd.read_json(path_or_buf=countries_file)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:52:36.950435Z",
"iopub.status.busy": "2022-06-24T21:52:36.949946Z",
"iopub.status.idle": "2022-06-24T21:52:36.959935Z",
"shell.execute_reply": "2022-06-24T21:52:36.958581Z",
"shell.execute_reply.started": "2022-06-24T21:52:36.950398Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [
"country_url = df[\"country_url\"]\n",
"# Row count first, then the distinct null flags from both spellings of the check\n",
"# (isnull is pandas' alias for isna).\n",
"print(country_url.shape)\n",
"for null_mask in (country_url.isnull(), country_url.isna()):\n",
"    print(null_mask.unique())"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "48db8f93-659b-45a4-8477-a7cec139bebc",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:52:37.635010Z",
"iopub.status.busy": "2022-06-24T21:52:37.634417Z",
"iopub.status.idle": "2022-06-24T21:52:37.645162Z",
"shell.execute_reply": "2022-06-24T21:52:37.643796Z",
"shell.execute_reply.started": "2022-06-24T21:52:37.634953Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [
"short_country_name = df[\"short_country_name\"]\n",
"# Row count first, then the distinct null flags from both spellings of the check\n",
"# (isnull is pandas' alias for isna).\n",
"print(short_country_name.shape)\n",
"for null_mask in (short_country_name.isnull(), short_country_name.isna()):\n",
"    print(null_mask.unique())"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:52:49.070882Z",
"iopub.status.busy": "2022-06-24T21:52:49.070107Z",
"iopub.status.idle": "2022-06-24T21:52:49.076033Z",
"shell.execute_reply": "2022-06-24T21:52:49.075243Z",
"shell.execute_reply.started": "2022-06-24T21:52:49.070853Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False True]\n",
"[False True]\n"
]
}
],
"source": [
"flag_html = df[\"flag_html\"]\n",
"# Row count first, then the distinct null flags from both spellings of the check\n",
"# (isnull is pandas' alias for isna).\n",
"print(flag_html.shape)\n",
"for null_mask in (flag_html.isnull(), flag_html.isna()):\n",
"    print(null_mask.unique())"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:53:15.797108Z",
"iopub.status.busy": "2022-06-24T21:53:15.796761Z",
"iopub.status.idle": "2022-06-24T21:53:15.809904Z",
"shell.execute_reply": "2022-06-24T21:53:15.809157Z",
"shell.execute_reply.started": "2022-06-24T21:53:15.797079Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" country_url | \n",
" flag_image_url | \n",
" short_country_name | \n",
" country_html | \n",
" flag_html | \n",
" file_urls | \n",
" files | \n",
"
\n",
" \n",
" \n",
" \n",
" | 84 | \n",
" https://en.wikipedia.org/wiki/Paraguay | \n",
" https://en.wikipedia.org/wiki/File:Flag_of_Par... | \n",
" Paraguay | \n",
" [<tr><th colspan=\"2\" class=\"infobox-above adr\"... | \n",
" None | \n",
" [https:////upload.wikimedia.org/wikipedia/comm... | \n",
" [{'url': 'https://upload.wikimedia.org/wikiped... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" country_url \\\n",
"84 https://en.wikipedia.org/wiki/Paraguay \n",
"\n",
" flag_image_url short_country_name \\\n",
"84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n",
"\n",
" country_html flag_html \\\n",
"84 [\n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" country_url | \n",
" flag_image_url | \n",
" short_country_name | \n",
" country_html | \n",
" flag_html | \n",
" file_urls | \n",
" files | \n",
" \n",
" \n",
" \n",
" \n",
" | 84 | \n",
" https://en.wikipedia.org/wiki/Paraguay | \n",
" https://en.wikipedia.org/wiki/File:Flag_of_Par... | \n",
" Paraguay | \n",
" [<tr><th colspan=\"2\" class=\"infobox-above adr\"... | \n",
" None | \n",
" [https:////upload.wikimedia.org/wikipedia/comm... | \n",
" [{'url': 'https://upload.wikimedia.org/wikiped... | \n",
" \n",
" \n",
" \n",
""
],
"text/plain": [
" country_url \\\n",
"84 https://en.wikipedia.org/wiki/Paraguay \n",
"\n",
" flag_image_url short_country_name \\\n",
"84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n",
"\n",
" country_html flag_html \\\n",
"84 [ |
|---|
|
|---|