{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d8185790-0793-4881-99e8-6730f95a8006", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:54.386982Z", "iopub.status.busy": "2022-06-24T22:04:54.386313Z", "iopub.status.idle": "2022-06-24T22:04:54.854521Z", "shell.execute_reply": "2022-06-24T22:04:54.853581Z", "shell.execute_reply.started": "2022-06-24T22:04:54.386910Z" }, "tags": [] }, "outputs": [], "source": [ "import json\n", "import pathlib\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:55.458615Z", "iopub.status.busy": "2022-06-24T22:04:55.457695Z", "iopub.status.idle": "2022-06-24T22:04:55.475878Z", "shell.execute_reply": "2022-06-24T22:04:55.474706Z", "shell.execute_reply.started": "2022-06-24T22:04:55.458548Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[None]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_options = {\n", " \"display.max_rows\": None,\n", "}\n", "\n", "[pd.set_option(option, value) for option, value in pd_options.items()]" ] }, { "cell_type": "code", "execution_count": 3, "id": "36149580-91d9-431d-99c3-51feee829e79", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:56.134416Z", "iopub.status.busy": "2022-06-24T22:04:56.133745Z", "iopub.status.idle": "2022-06-24T22:04:56.140326Z", "shell.execute_reply": "2022-06-24T22:04:56.138507Z", "shell.execute_reply.started": "2022-06-24T22:04:56.134371Z" }, "tags": [] }, "outputs": [], "source": [ "data_directory = (\n", " pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "d03be94e-8642-4916-8a43-1711e0c21b36", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:56.621163Z", "iopub.status.busy": "2022-06-24T22:04:56.620692Z", "iopub.status.idle": "2022-06-24T22:04:56.731001Z", "shell.execute_reply": "2022-06-24T22:04:56.728392Z", "shell.execute_reply.started": "2022-06-24T22:04:56.621128Z" }, "tags": [] }, "outputs": [], "source": [ "countries_file = data_directory / \"countries.json\"\n", "countries = json.loads(countries_file.read_text())\n", "# countries" ] }, { "cell_type": "code", "execution_count": 5, "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:57.257218Z", "iopub.status.busy": "2022-06-24T22:04:57.256573Z", "iopub.status.idle": "2022-06-24T22:04:57.333032Z", "shell.execute_reply": "2022-06-24T22:04:57.332120Z", "shell.execute_reply.started": "2022-06-24T22:04:57.257174Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['country_url', 'flag_description_url', 'short_country_name',\n", " 'country_html', 'flag_html', 'file_urls', 'files'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_json(countries_file)\n", "df.columns" ] }, { "cell_type": "code", "execution_count": 6, "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:59.223608Z", "iopub.status.busy": "2022-06-24T22:04:59.222961Z", "iopub.status.idle": "2022-06-24T22:04:59.229384Z", "shell.execute_reply": "2022-06-24T22:04:59.228618Z", "shell.execute_reply.started": "2022-06-24T22:04:59.223578Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n", "[False]\n" ] } ], "source": [ "country_url = df[\"country_url\"]\n", "print(country_url.shape)\n", "print(country_url.isnull().unique())\n", "print(country_url.isna().unique())" ] }, { "cell_type": "code", "execution_count": 7, "id": "48db8f93-659b-45a4-8477-a7cec139bebc", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:59.710467Z", "iopub.status.busy": "2022-06-24T22:04:59.709874Z", "iopub.status.idle": "2022-06-24T22:04:59.720517Z", "shell.execute_reply": "2022-06-24T22:04:59.717623Z", "shell.execute_reply.started": "2022-06-24T22:04:59.710431Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n", "[False]\n" ] } ], "source": [ "short_country_name = df[\"short_country_name\"]\n", "print(short_country_name.shape)\n", "print(short_country_name.isnull().unique())\n", "print(short_country_name.isna().unique())" ] }, { "cell_type": "code", "execution_count": 8, "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:04:59.950051Z", "iopub.status.busy": "2022-06-24T22:04:59.949622Z", "iopub.status.idle": "2022-06-24T22:04:59.956484Z", "shell.execute_reply": "2022-06-24T22:04:59.955471Z", "shell.execute_reply.started": "2022-06-24T22:04:59.950016Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(206,)\n", "[False]\n", "[False]\n" ] } ], "source": [ "flag_html = df[\"flag_html\"]\n", "print(flag_html.shape)\n", "print(flag_html.isnull().unique())\n", "print(flag_html.isna().unique())" ] }, { "cell_type": "code", "execution_count": 9, "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:05:00.166633Z", "iopub.status.busy": "2022-06-24T22:05:00.166278Z", "iopub.status.idle": "2022-06-24T22:05:00.178277Z", "shell.execute_reply": "2022-06-24T22:05:00.177378Z", "shell.execute_reply.started": "2022-06-24T22:05:00.166609Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n", "Index: []" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df[\"flag_html\"].isnull()]" ] }, { "cell_type": "code", "execution_count": 10, "id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:05:00.714817Z", "iopub.status.busy": "2022-06-24T22:05:00.714232Z", "iopub.status.idle": "2022-06-24T22:05:00.728680Z", "shell.execute_reply": "2022-06-24T22:05:00.727307Z", "shell.execute_reply.started": "2022-06-24T22:05:00.714774Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n", "Index: []" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df[\"flag_html\"].isna()]" ] }, { "cell_type": "code", "execution_count": 18, "id": "227b0c76-9e45-4849-849e-36355976cba9", "metadata": { "execution": { "iopub.execute_input": "2022-06-24T22:13:34.716780Z", "iopub.status.busy": "2022-06-24T22:13:34.716226Z", "iopub.status.idle": "2022-06-24T22:13:34.734266Z", "shell.execute_reply": "2022-06-24T22:13:34.733297Z", "shell.execute_reply.started": "2022-06-24T22:13:34.716742Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "country_url https://en.wikipedia.org/wiki/Paraguay\n", "flag_description_url https://en.wikipedia.org/wiki/Flag_of_Paraguay\n", "short_country_name Paraguay\n", "country_html [The flag of
Islamic Emirate of Afghanistan
\\n
\\n
\"Flag
\\n
Flag
\\n
\\n
\\n
\"Coat
\\n
Emblem
\\n
\\n
Anthem:\\xa0دا د باتورانو کور
Dā Də Bātorāno Kor
\"This is the Home of the Brave\"[2]
\"AfghanistanAfghanistan on the globe
\"AfghanistanMap of Afghanistan
StatusUN member state under an unrecognized government[3]Capital
and largest city
Kabul
34°31′N 69°11′E\\ufeff / \\ufeff34.517°N 69.183°E\\ufeff / 34.517; 69.183Coordinates: 34°31′N 69°11′E\\ufeff / \\ufeff34.517°N 69.183°E\\ufeff / 34.517; 69.183[4]Major languages
Ethnic\\xa0groups
(2019 unofficial estimates)[a][6][7][8][9]
Religion
Demonym(s)Afghan[b][13][14]GovernmentUnitary provisional theocratic Islamic emirate[15][16][17]
•\\xa0Leader
Hibatullah Akhundzada
•\\xa0Prime Minister
Hasan Akhund (acting)
•\\xa0Chief Justice
Abdul Hakim Ishaqzai\\nLegislatureLeadership Council (consultative body)[18]Formation
•\\xa0Hotak Empire
17091738
•\\xa0Durrani Empire
1747–1823
•\\xa0Emirate
1823–1839
•\\xa0Restoration of the Durrani Kingdom
1839–1842
•\\xa0Restoration of the Emirate
1842–1926
•\\xa0Dost Mohammad unites Afghanistan
27 May 1863
•\\xa0Anglo-Afghan Agreement
26 May 1879
•\\xa0Independence
19 August 1919
•\\xa0Kingdom
9 June 1926
•\\xa0Republic
17 July 1973
•\\xa0Democratic Republic
27–28 April 1978
•\\xa0Islamic State
28 April 1992
•\\xa0Islamic Emirate
27 September 1996
•\\xa0Islamic Republic
26 January 2004
•\\xa0Restoration of Islamic Emirate
15 August 2021\\nArea
•\\xa0Total
652,867[19]\\xa0km2 (252,073\\xa0sq\\xa0mi) (40th)
•\\xa0Water\\xa0(%)
negligiblePopulation
•\\xa02021 estimate
40,218,234[7] (37th)
•\\xa0Density
48.08/km2 (124.5/sq\\xa0mi) (174th)GDP\\xa0(PPP)2018\\xa0estimate
•\\xa0Total
$72.911\\xa0billion[20] (96th)
•\\xa0Per capita
$2,024[20] (169th)GDP\\xa0(nominal)2018\\xa0estimate
•\\xa0Total
$21.657\\xa0billion[20] (111st)
•\\xa0Per capita
$493[20] (177th)HDI\\xa0(2019)\"Increase\"\\xa00.511[21]
low\\xa0·\\xa0169thCurrencyAfghani (افغانی) (AFN)Time zoneUTC+4:30
Solar Calendar
(D†)Driving siderightCalling code+93ISO 3166 codeAFInternet TLD.af
افغانستان.'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(df.iloc[0].country_html)\n", "content = \"\".join(df.iloc[0].country_html)\n", "content" ] }, { "cell_type": "code", "execution_count": null, "id": "97c1e41f-30f3-4116-aa11-5797e05b95ba", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "toc-autonumbering": true, "toc-showcode": false }, "nbformat": 4, "nbformat_minor": 5 }