{ "cells": [ { "cell_type": "code", "execution_count": 38, "id": "d8185790-0793-4881-99e8-6730f95a8006", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:31:57.297266Z", "iopub.status.busy": "2022-06-24T21:31:57.284090Z", "iopub.status.idle": "2022-06-24T21:31:57.366471Z", "shell.execute_reply": "2022-06-24T21:31:57.365193Z", "shell.execute_reply.started": "2022-06-24T21:31:57.293844Z" }, "tags": [] }, "outputs": [], "source": [ "import json\n", "import pathlib\n", "\n", "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 39, "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:31:58.025200Z", "iopub.status.busy": "2022-06-24T21:31:58.024201Z", "iopub.status.idle": "2022-06-24T21:31:58.108904Z", "shell.execute_reply": "2022-06-24T21:31:58.107402Z", "shell.execute_reply.started": "2022-06-24T21:31:58.025121Z" }, "tags": [] }, "outputs": [], "source": [ "# Notebook-wide pandas display options; a max_rows of None removes the row limit.\n", "pd_options = {\n", "    \"display.max_rows\": None,\n", "}\n", "\n", "# Plain loop instead of a list comprehension: set_option is called only for its\n", "# side effect, and a comprehension would leave a stray [None] as the cell output.\n", "for option, value in pd_options.items():\n", "    pd.set_option(option, value)" ] },
{ "cell_type": "code", "execution_count": 40, "id": "36149580-91d9-431d-99c3-51feee829e79", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:31:58.650508Z", "iopub.status.busy": "2022-06-24T21:31:58.650001Z", "iopub.status.idle": "2022-06-24T21:31:58.670264Z", "shell.execute_reply": "2022-06-24T21:31:58.669296Z", "shell.execute_reply.started": "2022-06-24T21:31:58.650473Z" }, "tags": [] }, "outputs": [], "source": [ "# Directory holding the raw country data, resolved relative to the grandparent\n", "# of the current working directory (parents[1]).\n", "data_directory = (\n", "    pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", ")" ] },
{ "cell_type": "code", "execution_count": 74, "id": "d03be94e-8642-4916-8a43-1711e0c21b36", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:52:35.726961Z", "iopub.status.busy":
"2022-06-24T21:52:35.726356Z", "iopub.status.idle": "2022-06-24T21:52:35.823456Z", "shell.execute_reply": "2022-06-24T21:52:35.822464Z", "shell.execute_reply.started": "2022-06-24T21:52:35.726932Z" }, "tags": [] }, "outputs": [], "source": [ "countries_file = data_directory / \"countries.json\"\n", "# JSON text is UTF-8 by specification, so decode it explicitly rather than\n", "# relying on the platform's default locale encoding.\n", "countries = json.loads(countries_file.read_text(encoding=\"utf-8\"))" ] },
{ "cell_type": "code", "execution_count": 75, "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:52:36.453655Z", "iopub.status.busy": "2022-06-24T21:52:36.452202Z", "iopub.status.idle": "2022-06-24T21:52:36.555461Z", "shell.execute_reply": "2022-06-24T21:52:36.554096Z", "shell.execute_reply.started": "2022-06-24T21:52:36.453559Z" }, "tags": [] }, "outputs": [], "source": [ "df = pd.read_json(countries_file)" ] },
{ "cell_type": "code", "execution_count": 76, "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:52:36.950435Z", "iopub.status.busy": "2022-06-24T21:52:36.949946Z", "iopub.status.idle": "2022-06-24T21:52:36.959935Z", "shell.execute_reply": "2022-06-24T21:52:36.958581Z", "shell.execute_reply.started": "2022-06-24T21:52:36.950398Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n" ] } ], "source": [ "def print_null_summary(column: pd.Series) -> None:\n", "    \"\"\"Print a Series' shape and the distinct values of its missing-value mask.\n", "\n", "    A mask of [False] means nothing is missing; a mask that also contains\n", "    True means some entries are missing.\n", "    \"\"\"\n", "    print(column.shape)\n", "    # isnull() is an alias of isna(), so a single call covers both checks.\n", "    print(column.isna().unique())\n", "\n", "\n", "country_url = df[\"country_url\"]\n", "print_null_summary(country_url)" ] },
{ "cell_type": "code", "execution_count": 77, "id": "48db8f93-659b-45a4-8477-a7cec139bebc", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:52:37.635010Z", "iopub.status.busy": "2022-06-24T21:52:37.634417Z", "iopub.status.idle": "2022-06-24T21:52:37.645162Z", "shell.execute_reply": "2022-06-24T21:52:37.643796Z", "shell.execute_reply.started": "2022-06-24T21:52:37.634953Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n" ] } ], "source": [ "short_country_name = df[\"short_country_name\"]\n", "print_null_summary(short_country_name)" ] },
{ "cell_type": "code", "execution_count": 81, "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:52:49.070882Z", "iopub.status.busy": "2022-06-24T21:52:49.070107Z", "iopub.status.idle": "2022-06-24T21:52:49.076033Z", "shell.execute_reply": "2022-06-24T21:52:49.075243Z", "shell.execute_reply.started": "2022-06-24T21:52:49.070853Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False True]\n" ] } ], "source": [ "# flag_html is the one column here that mixes present and missing values.\n", "flag_html = df[\"flag_html\"]\n", "print_null_summary(flag_html)" ] },
{ "cell_type": "code", "execution_count": 84, "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:53:15.797108Z", "iopub.status.busy": "2022-06-24T21:53:15.796761Z", "iopub.status.idle": "2022-06-24T21:53:15.809904Z", "shell.execute_reply": "2022-06-24T21:53:15.809157Z", "shell.execute_reply.started": "2022-06-24T21:53:15.797079Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_image_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
84https://en.wikipedia.org/wiki/Paraguayhttps://en.wikipedia.org/wiki/File:Flag_of_Par...Paraguay[<tr><th colspan=\"2\" class=\"infobox-above adr\"...None[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "
" ], "text/plain": [ " country_url \\\n", "84 https://en.wikipedia.org/wiki/Paraguay \n", "\n", " flag_image_url short_country_name \\\n", "84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n", "\n", " country_html flag_html \\\n", "84 [\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_image_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
84https://en.wikipedia.org/wiki/Paraguayhttps://en.wikipedia.org/wiki/File:Flag_of_Par...Paraguay[<tr><th colspan=\"2\" class=\"infobox-above adr\"...None[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "" ], "text/plain": [ " country_url \\\n", "84 https://en.wikipedia.org/wiki/Paraguay \n", "\n", " flag_image_url short_country_name \\\n", "84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n", "\n", " country_html flag_html \\\n", "84 [