{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-title",
   "metadata": {},
   "source": [
    "# Raw country data checks\n",
    "\n",
    "Sanity checks on `countries.json` from `data/scrapy/raw_country_data`: column inventory, missing values, downloaded files, and spot checks of individual `country_html` entries.\n",
    "\n",
    "Outputs are intentionally not stored; use *Restart Kernel and Run All Cells* to regenerate them."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-setup",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8185790-0793-4881-99e8-6730f95a8006",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pathlib\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pd_options = {\n",
    "    \"display.max_rows\": None,\n",
    "}\n",
    "\n",
    "# Plain loop instead of a list comprehension: set_option is called only for\n",
    "# its side effect, and a comprehension would echo a list of None values.\n",
    "for option, value in pd_options.items():\n",
    "    pd.set_option(option, value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36149580-91d9-431d-99c3-51feee829e79",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Resolved against the kernel's working directory: two levels up, then\n",
    "# data/scrapy/raw_country_data.\n",
    "data_directory = (\n",
    "    pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d03be94e-8642-4916-8a43-1711e0c21b36",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "countries_file = data_directory / \"countries.json\"\n",
    "# Explicit encoding: the default used by read_text() depends on the platform locale.\n",
    "countries = json.loads(countries_file.read_text(encoding=\"utf-8\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df = pd.read_json(countries_file)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-missing-values",
   "metadata": {},
   "source": [
    "## Missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "country_url = df[\"country_url\"]\n",
    "short_country_name = df[\"short_country_name\"]\n",
    "flag_html = df[\"flag_html\"]\n",
    "\n",
    "# isnull() is an alias of isna(), so one null count per column is enough.\n",
    "inspected_columns = [\"country_url\", \"short_country_name\", \"flag_html\"]\n",
    "df[inspected_columns].isna().sum().to_frame(\"nulls\").assign(rows=len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Rows without a flag description; an empty frame means nothing is missing.\n",
    "df[df[\"flag_html\"].isna()]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-country-html",
   "metadata": {},
   "source": [
    "## `country_html` spot check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "227b0c76-9e45-4849-849e-36355976cba9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Positional spot check: element 4 of country_html for the record at row position 102.\n",
    "df[\"country_html\"].iloc[102][4]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-files",
   "metadata": {},
   "source": [
    "## Downloaded files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files_per_country = df[\"files\"].map(len)\n",
    "\n",
    "# Number of records with at least one entry in `files`.\n",
    "print((files_per_country != 0).sum())\n",
    "\n",
    "# Records with no entries in `files`, shown with enough context to identify them.\n",
    "df.loc[files_per_country == 0, [\"short_country_name\", \"file_urls\", \"files\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-africa",
   "metadata": {},
   "source": [
    "## Countries with \"Africa\" in the name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7e60156-1ee5-4bf9-ab9a-d529ee988301",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Literal substring match; na=False keeps the mask boolean if a name is missing.\n",
    "df[df[\"short_country_name\"].str.contains(\"Africa\", regex=False, na=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97c1e41f-30f3-4116-aa11-5797e05b95ba",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Select South Africa by name rather than by row position, so the check\n",
    "# survives a change in record order.\n",
    "south_africa = df.loc[df[\"short_country_name\"] == \"South_Africa\"].iloc[0]\n",
    "south_africa[\"country_html\"][15]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "toc-autonumbering": true,
  "toc-showcode": false
 },
 "nbformat": 4,
 "nbformat_minor": 5
}