{ "cells": [ { "cell_type": "code", "execution_count": 38, "id": "d8185790-0793-4881-99e8-6730f95a8006", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:31:57.297266Z", "iopub.status.busy": "2022-06-24T21:31:57.284090Z", "iopub.status.idle": "2022-06-24T21:31:57.366471Z", "shell.execute_reply": "2022-06-24T21:31:57.365193Z", "shell.execute_reply.started": "2022-06-24T21:31:57.293844Z" }, "tags": [] }, "outputs": [], "source": [ "import json\n", "import pathlib\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 39, "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:31:58.025200Z", "iopub.status.busy": "2022-06-24T21:31:58.024201Z", "iopub.status.idle": "2022-06-24T21:31:58.108904Z", "shell.execute_reply": "2022-06-24T21:31:58.107402Z", "shell.execute_reply.started": "2022-06-24T21:31:58.025121Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[None]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_options = {\n", " \"display.max_rows\": None,\n", "}\n", "\n", "[pd.set_option(option, value) for option, value in pd_options.items()]" ] }, { "cell_type": "code", "execution_count": 40, "id": "36149580-91d9-431d-99c3-51feee829e79", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:31:58.650508Z", "iopub.status.busy": "2022-06-24T21:31:58.650001Z", "iopub.status.idle": "2022-06-24T21:31:58.670264Z", "shell.execute_reply": "2022-06-24T21:31:58.669296Z", "shell.execute_reply.started": "2022-06-24T21:31:58.650473Z" }, "tags": [] }, "outputs": [], "source": [ "data_directory = (\n", " pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", ")" ] }, { "cell_type": "code", "execution_count": 41, "id": "d03be94e-8642-4916-8a43-1711e0c21b36", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:32:00.473759Z", "iopub.status.busy": "2022-06-24T21:32:00.473129Z", "iopub.status.idle": "2022-06-24T21:32:00.812851Z", "shell.execute_reply": "2022-06-24T21:32:00.812131Z", "shell.execute_reply.started": "2022-06-24T21:32:00.473730Z" }, "tags": [] }, "outputs": [], "source": [ "countries_file = data_directory / \"countries.json\"\n", "countries = json.loads(countries_file.read_text())\n", "# countries" ] }, { "cell_type": "code", "execution_count": 42, "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:32:01.728669Z", "iopub.status.busy": "2022-06-24T21:32:01.728021Z", "iopub.status.idle": "2022-06-24T21:32:01.919644Z", "shell.execute_reply": "2022-06-24T21:32:01.917919Z", "shell.execute_reply.started": "2022-06-24T21:32:01.728629Z" }, "tags": [] }, "outputs": [], "source": [ "df = pd.read_json(countries_file)" ] }, { "cell_type": "code", "execution_count": 43, "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:32:04.251701Z", "iopub.status.busy": "2022-06-24T21:32:04.250831Z", "iopub.status.idle": "2022-06-24T21:32:04.263351Z", "shell.execute_reply": "2022-06-24T21:32:04.262556Z", "shell.execute_reply.started": "2022-06-24T21:32:04.251670Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n", "[False]\n" ] } ], "source": [ "country_url = df[\"country_url\"]\n", "print(country_url.shape)\n", "print(country_url.isnull().unique())\n", "print(country_url.isna().unique())" ] }, { "cell_type": "code", "execution_count": 44, "id": "48db8f93-659b-45a4-8477-a7cec139bebc", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:32:05.191313Z", "iopub.status.busy": "2022-06-24T21:32:05.190427Z", "iopub.status.idle": "2022-06-24T21:32:05.233768Z", "shell.execute_reply": "2022-06-24T21:32:05.232310Z", "shell.execute_reply.started": "2022-06-24T21:32:05.191232Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n", "[False]\n" ] } ], "source": [ "short_country_name = df[\"short_country_name\"]\n", "print(short_country_name.shape)\n", "print(short_country_name.isnull().unique())\n", "print(short_country_name.isna().unique())" ] }, { "cell_type": "code", "execution_count": 58, "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:36:11.684046Z", "iopub.status.busy": "2022-06-24T21:36:11.683658Z", "iopub.status.idle": "2022-06-24T21:36:11.690174Z", "shell.execute_reply": "2022-06-24T21:36:11.689279Z", "shell.execute_reply.started": "2022-06-24T21:36:11.684015Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False True]\n", "[False True]\n" ] } ], "source": [ "flag_html = df[\"flag_html\"]\n", "print(flag_html.shape)\n", "print(flag_html.isnull().unique())\n", "print(flag_html.isna().unique())" ] }, { "cell_type": "code", "execution_count": 59, "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T21:36:13.608192Z", "iopub.status.busy": "2022-06-24T21:36:13.607778Z", "iopub.status.idle": "2022-06-24T21:36:13.623089Z", "shell.execute_reply": "2022-06-24T21:36:13.622311Z", "shell.execute_reply.started": "2022-06-24T21:36:13.608162Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
84https://en.wikipedia.org/wiki/ParaguayParaguay[<tr><th colspan=\"2\" class=\"infobox-above adr\"...None[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "
" ], "text/plain": [ " country_url short_country_name \\\n", "84 https://en.wikipedia.org/wiki/Paraguay Paraguay \n", "\n", " country_html flag_html \\\n", "84 [