487 lines
14 KiB
Plaintext
487 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"id": "d8185790-0793-4881-99e8-6730f95a8006",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:31:57.297266Z",
|
|
"iopub.status.busy": "2022-06-24T21:31:57.284090Z",
|
|
"iopub.status.idle": "2022-06-24T21:31:57.366471Z",
|
|
"shell.execute_reply": "2022-06-24T21:31:57.365193Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:31:57.293844Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"import pathlib\n",
|
|
"\n",
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:31:58.025200Z",
|
|
"iopub.status.busy": "2022-06-24T21:31:58.024201Z",
|
|
"iopub.status.idle": "2022-06-24T21:31:58.108904Z",
|
|
"shell.execute_reply": "2022-06-24T21:31:58.107402Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:31:58.025121Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[None]"
|
|
]
|
|
},
|
|
"execution_count": 39,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pd_options = {\n",
|
|
" \"display.max_rows\": None,\n",
|
|
"}\n",
|
|
"\n",
|
|
"[pd.set_option(option, value) for option, value in pd_options.items()]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"id": "36149580-91d9-431d-99c3-51feee829e79",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:31:58.650508Z",
|
|
"iopub.status.busy": "2022-06-24T21:31:58.650001Z",
|
|
"iopub.status.idle": "2022-06-24T21:31:58.670264Z",
|
|
"shell.execute_reply": "2022-06-24T21:31:58.669296Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:31:58.650473Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Build the data path relative to the directory two levels above the current working directory.\n",
"data_directory = pathlib.Path(\".\").resolve().parents[1].joinpath(\n",
"    \"data\", \"scrapy\", \"raw_country_data\"\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 74,
|
|
"id": "d03be94e-8642-4916-8a43-1711e0c21b36",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:52:35.726961Z",
|
|
"iopub.status.busy": "2022-06-24T21:52:35.726356Z",
|
|
"iopub.status.idle": "2022-06-24T21:52:35.823456Z",
|
|
"shell.execute_reply": "2022-06-24T21:52:35.822464Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:52:35.726932Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"countries_file = data_directory / \"countries.json\"\n",
"# Decode explicitly as UTF-8 (the JSON interchange encoding) so parsing does not\n",
"# depend on the platform's default locale encoding.\n",
"countries = json.loads(countries_file.read_text(encoding=\"utf-8\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 75,
|
|
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:52:36.453655Z",
|
|
"iopub.status.busy": "2022-06-24T21:52:36.452202Z",
|
|
"iopub.status.idle": "2022-06-24T21:52:36.555461Z",
|
|
"shell.execute_reply": "2022-06-24T21:52:36.554096Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:52:36.453559Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Load countries.json into a DataFrame for column-level inspection.\n",
"df = pd.read_json(path_or_buf=countries_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 76,
|
|
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:52:36.950435Z",
|
|
"iopub.status.busy": "2022-06-24T21:52:36.949946Z",
|
|
"iopub.status.idle": "2022-06-24T21:52:36.959935Z",
|
|
"shell.execute_reply": "2022-06-24T21:52:36.958581Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:52:36.950398Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(206,)\n",
|
|
"[False]\n",
|
|
"[False]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Report the column length and the distinct missing-value flags.\n",
"# isnull() and isna() are aliases, so the last two lines always agree.\n",
"country_url = df[\"country_url\"]\n",
"print(\n",
"    country_url.shape,\n",
"    country_url.isnull().unique(),\n",
"    country_url.isna().unique(),\n",
"    sep=\"\\n\",\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 77,
|
|
"id": "48db8f93-659b-45a4-8477-a7cec139bebc",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:52:37.635010Z",
|
|
"iopub.status.busy": "2022-06-24T21:52:37.634417Z",
|
|
"iopub.status.idle": "2022-06-24T21:52:37.645162Z",
|
|
"shell.execute_reply": "2022-06-24T21:52:37.643796Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:52:37.634953Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(206,)\n",
|
|
"[False]\n",
|
|
"[False]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Report the column length and the distinct missing-value flags.\n",
"# isnull() and isna() are aliases, so the last two lines always agree.\n",
"short_country_name = df[\"short_country_name\"]\n",
"print(\n",
"    short_country_name.shape,\n",
"    short_country_name.isnull().unique(),\n",
"    short_country_name.isna().unique(),\n",
"    sep=\"\\n\",\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:52:49.070882Z",
|
|
"iopub.status.busy": "2022-06-24T21:52:49.070107Z",
|
|
"iopub.status.idle": "2022-06-24T21:52:49.076033Z",
|
|
"shell.execute_reply": "2022-06-24T21:52:49.075243Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:52:49.070853Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(206,)\n",
|
|
"[False True]\n",
|
|
"[False True]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Report the column length and the distinct missing-value flags.\n",
"# isnull() and isna() are aliases, so the last two lines always agree.\n",
"flag_html = df[\"flag_html\"]\n",
"print(\n",
"    flag_html.shape,\n",
"    flag_html.isnull().unique(),\n",
"    flag_html.isna().unique(),\n",
"    sep=\"\\n\",\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 84,
|
|
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:53:15.797108Z",
|
|
"iopub.status.busy": "2022-06-24T21:53:15.796761Z",
|
|
"iopub.status.idle": "2022-06-24T21:53:15.809904Z",
|
|
"shell.execute_reply": "2022-06-24T21:53:15.809157Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:53:15.797079Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>country_url</th>\n",
|
|
" <th>flag_image_url</th>\n",
|
|
" <th>short_country_name</th>\n",
|
|
" <th>country_html</th>\n",
|
|
" <th>flag_html</th>\n",
|
|
" <th>file_urls</th>\n",
|
|
" <th>files</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>84</th>\n",
|
|
" <td>https://en.wikipedia.org/wiki/Paraguay</td>\n",
|
|
" <td>https://en.wikipedia.org/wiki/File:Flag_of_Par...</td>\n",
|
|
" <td>Paraguay</td>\n",
|
|
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
|
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" country_url \\\n",
|
|
"84 https://en.wikipedia.org/wiki/Paraguay \n",
|
|
"\n",
|
|
" flag_image_url short_country_name \\\n",
|
|
"84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n",
|
|
"\n",
|
|
" country_html flag_html \\\n",
|
|
"84 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... None \n",
|
|
"\n",
|
|
" file_urls \\\n",
|
|
"84 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
|
"\n",
|
|
" files \n",
|
|
"84 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
|
]
|
|
},
|
|
"execution_count": 84,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Show every row whose flag_html value is missing.\n",
"df.loc[df[\"flag_html\"].isnull()]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 85,
|
|
"id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:53:31.685434Z",
|
|
"iopub.status.busy": "2022-06-24T21:53:31.684830Z",
|
|
"iopub.status.idle": "2022-06-24T21:53:31.697841Z",
|
|
"shell.execute_reply": "2022-06-24T21:53:31.697000Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:53:31.685404Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>country_url</th>\n",
|
|
" <th>flag_image_url</th>\n",
|
|
" <th>short_country_name</th>\n",
|
|
" <th>country_html</th>\n",
|
|
" <th>flag_html</th>\n",
|
|
" <th>file_urls</th>\n",
|
|
" <th>files</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>84</th>\n",
|
|
" <td>https://en.wikipedia.org/wiki/Paraguay</td>\n",
|
|
" <td>https://en.wikipedia.org/wiki/File:Flag_of_Par...</td>\n",
|
|
" <td>Paraguay</td>\n",
|
|
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
|
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" country_url \\\n",
|
|
"84 https://en.wikipedia.org/wiki/Paraguay \n",
|
|
"\n",
|
|
" flag_image_url short_country_name \\\n",
|
|
"84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n",
|
|
"\n",
|
|
" country_html flag_html \\\n",
|
|
"84 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... None \n",
|
|
"\n",
|
|
" file_urls \\\n",
|
|
"84 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
|
"\n",
|
|
" files \n",
|
|
"84 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
|
]
|
|
},
|
|
"execution_count": 85,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Same selection through the isna() alias: rows with a missing flag_html, all columns.\n",
"df.loc[df[\"flag_html\"].isna(), :]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 89,
|
|
"id": "227b0c76-9e45-4849-849e-36355976cba9",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:54:13.588753Z",
|
|
"iopub.status.busy": "2022-06-24T21:54:13.588402Z",
|
|
"iopub.status.idle": "2022-06-24T21:54:13.594182Z",
|
|
"shell.execute_reply": "2022-06-24T21:54:13.593418Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:54:13.588723Z"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'https://en.wikipedia.org/wiki/File:Flag_of_Paraguay.svg'"
|
|
]
|
|
},
|
|
"execution_count": 89,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# 84 is the index label of the row with a missing flag_html, so look it up by label\n",
"# (loc) in a single step rather than by position (iloc) with chained indexing.\n",
"df.loc[84, \"flag_image_url\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 87,
|
|
"id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2022-06-24T21:53:55.941453Z",
|
|
"iopub.status.busy": "2022-06-24T21:53:55.940811Z",
|
|
"iopub.status.idle": "2022-06-24T21:53:55.947972Z",
|
|
"shell.execute_reply": "2022-06-24T21:53:55.947077Z",
|
|
"shell.execute_reply.started": "2022-06-24T21:53:55.941423Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"206\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[]"
|
|
]
|
|
},
|
|
"execution_count": 87,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Print how many rows have a non-empty files value, then list any files values that are empty.\n",
"print(sum(len(file) != 0 for file in df[\"files\"]))\n",
"[file for file in df[\"files\"] if not len(file)]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d7e60156-1ee5-4bf9-ab9a-d529ee988301",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.12"
|
|
},
|
|
"toc-autonumbering": true,
|
|
"toc-showcode": false
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|