{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d8185790-0793-4881-99e8-6730f95a8006",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:54.386982Z",
"iopub.status.busy": "2022-06-24T22:04:54.386313Z",
"iopub.status.idle": "2022-06-24T22:04:54.854521Z",
"shell.execute_reply": "2022-06-24T22:04:54.853581Z",
"shell.execute_reply.started": "2022-06-24T22:04:54.386910Z"
},
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"import pathlib\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:55.458615Z",
"iopub.status.busy": "2022-06-24T22:04:55.457695Z",
"iopub.status.idle": "2022-06-24T22:04:55.475878Z",
"shell.execute_reply": "2022-06-24T22:04:55.474706Z",
"shell.execute_reply.started": "2022-06-24T22:04:55.458548Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[None]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd_options = {\n",
" \"display.max_rows\": None,\n",
"}\n",
"\n",
"[pd.set_option(option, value) for option, value in pd_options.items()]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "36149580-91d9-431d-99c3-51feee829e79",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:56.134416Z",
"iopub.status.busy": "2022-06-24T22:04:56.133745Z",
"iopub.status.idle": "2022-06-24T22:04:56.140326Z",
"shell.execute_reply": "2022-06-24T22:04:56.138507Z",
"shell.execute_reply.started": "2022-06-24T22:04:56.134371Z"
},
"tags": []
},
"outputs": [],
"source": [
"data_directory = (\n",
" pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d03be94e-8642-4916-8a43-1711e0c21b36",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:56.621163Z",
"iopub.status.busy": "2022-06-24T22:04:56.620692Z",
"iopub.status.idle": "2022-06-24T22:04:56.731001Z",
"shell.execute_reply": "2022-06-24T22:04:56.728392Z",
"shell.execute_reply.started": "2022-06-24T22:04:56.621128Z"
},
"tags": []
},
"outputs": [],
"source": [
"countries_file = data_directory / \"countries.json\"\n",
"countries = json.loads(countries_file.read_text())\n",
"# countries"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:57.257218Z",
"iopub.status.busy": "2022-06-24T22:04:57.256573Z",
"iopub.status.idle": "2022-06-24T22:04:57.333032Z",
"shell.execute_reply": "2022-06-24T22:04:57.332120Z",
"shell.execute_reply.started": "2022-06-24T22:04:57.257174Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['country_url', 'flag_description_url', 'short_country_name',\n",
" 'country_html', 'flag_html', 'file_urls', 'files'],\n",
" dtype='object')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_json(countries_file)\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:59.223608Z",
"iopub.status.busy": "2022-06-24T22:04:59.222961Z",
"iopub.status.idle": "2022-06-24T22:04:59.229384Z",
"shell.execute_reply": "2022-06-24T22:04:59.228618Z",
"shell.execute_reply.started": "2022-06-24T22:04:59.223578Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [
"country_url = df[\"country_url\"]\n",
"print(country_url.shape)\n",
"print(country_url.isnull().unique())\n",
"print(country_url.isna().unique())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "48db8f93-659b-45a4-8477-a7cec139bebc",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:59.710467Z",
"iopub.status.busy": "2022-06-24T22:04:59.709874Z",
"iopub.status.idle": "2022-06-24T22:04:59.720517Z",
"shell.execute_reply": "2022-06-24T22:04:59.717623Z",
"shell.execute_reply.started": "2022-06-24T22:04:59.710431Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [
"short_country_name = df[\"short_country_name\"]\n",
"print(short_country_name.shape)\n",
"print(short_country_name.isnull().unique())\n",
"print(short_country_name.isna().unique())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:04:59.950051Z",
"iopub.status.busy": "2022-06-24T22:04:59.949622Z",
"iopub.status.idle": "2022-06-24T22:04:59.956484Z",
"shell.execute_reply": "2022-06-24T22:04:59.955471Z",
"shell.execute_reply.started": "2022-06-24T22:04:59.950016Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [
"flag_html = df[\"flag_html\"]\n",
"print(flag_html.shape)\n",
"print(flag_html.isnull().unique())\n",
"print(flag_html.isna().unique())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:05:00.166633Z",
"iopub.status.busy": "2022-06-24T22:05:00.166278Z",
"iopub.status.idle": "2022-06-24T22:05:00.178277Z",
"shell.execute_reply": "2022-06-24T22:05:00.177378Z",
"shell.execute_reply.started": "2022-06-24T22:05:00.166609Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" country_url | \n",
" flag_description_url | \n",
" short_country_name | \n",
" country_html | \n",
" flag_html | \n",
" file_urls | \n",
" files | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n",
"Index: []"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df[\"flag_html\"].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:05:00.714817Z",
"iopub.status.busy": "2022-06-24T22:05:00.714232Z",
"iopub.status.idle": "2022-06-24T22:05:00.728680Z",
"shell.execute_reply": "2022-06-24T22:05:00.727307Z",
"shell.execute_reply.started": "2022-06-24T22:05:00.714774Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" country_url | \n",
" flag_description_url | \n",
" short_country_name | \n",
" country_html | \n",
" flag_html | \n",
" file_urls | \n",
" files | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n",
"Index: []"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df[\"flag_html\"].isna()]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "227b0c76-9e45-4849-849e-36355976cba9",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T22:13:34.716780Z",
"iopub.status.busy": "2022-06-24T22:13:34.716226Z",
"iopub.status.idle": "2022-06-24T22:13:34.734266Z",
"shell.execute_reply": "2022-06-24T22:13:34.733297Z",
"shell.execute_reply.started": "2022-06-24T22:13:34.716742Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"country_url https://en.wikipedia.org/wiki/Paraguay\n",
"flag_description_url https://en.wikipedia.org/wiki/Flag_of_Paraguay\n",
"short_country_name Paraguay\n",
"country_html [| The flag of | Islamic Emirate of Afghanistan - د افغانستان اسلامي امارت\\xa0(Pashto)
Də Afġānistān Islāmī Imārat - امارت اسلامی افغانستان\\xa0(Dari)
Imārat-i Islāmī-yi Afghānistān
|
|---|
|
Anthem:\\xa0دا د باتورانو کور Dā Də Bātorāno Kor \"This is the Home of the Brave\"[2] |
Afghanistan on the globe Map of Afghanistan |
| Status | UN member state under an unrecognized government[3] |
|---|
| Capital and largest city | Kabul 34°31′N 69°11′E\\ufeff / \\ufeff34.517°N 69.183°E\\ufeff / 34.517; 69.183Coordinates: 34°31′N 69°11′E\\ufeff / \\ufeff34.517°N 69.183°E\\ufeff / 34.517; 69.183[4] |
|---|
| Major languages | |
|---|
| Ethnic\\xa0groups | |
|---|
| Religion | |
|---|
| Demonym(s) | Afghan[b][13][14] |
|---|
| Government | Unitary provisional theocratic Islamic emirate[15][16][17] |
|---|
|
| Hibatullah Akhundzada |
|---|
| Hasan Akhund (acting) |
|---|
| Abdul Hakim Ishaqzai |
|---|
| \\n |
| Legislature | Leadership Council (consultative body)[18] |
|---|
|
|
| 1709–1738 |
|---|
| 1747–1823 |
|---|
| 1823–1839 |
|---|
| 1839–1842 |
|---|
| 1842–1926 |
|---|
| 27 May 1863 |
|---|
| 26 May 1879 |
|---|
| 19 August 1919 |
|---|
| 9 June 1926 |
|---|
| 17 July 1973 |
|---|
| 27–28 April 1978 |
|---|
| 28 April 1992 |
|---|
| 27 September 1996 |
|---|
| 26 January 2004 |
|---|
| 15 August 2021 |
|---|
| \\n |
|
•\\xa0Total | 652,867[19]\\xa0km2 (252,073\\xa0sq\\xa0mi) (40th) |
|---|
•\\xa0Water\\xa0(%) | negligible |
|---|
|
•\\xa02021 estimate | 40,218,234[7] (37th) |
|---|
•\\xa0Density | 48.08/km2 (124.5/sq\\xa0mi) (174th) |
|---|
| GDP\\xa0(PPP) | 2018\\xa0estimate |
|---|
•\\xa0Total | $72.911\\xa0billion[20] (96th) |
|---|
•\\xa0Per capita | $2,024[20] (169th) |
|---|
| GDP\\xa0(nominal) | 2018\\xa0estimate |
|---|
•\\xa0Total | $21.657\\xa0billion[20] (111st) |
|---|
•\\xa0Per capita | $493[20] (177th) |
|---|
| HDI\\xa0(2019) | \\xa00.511[21] low\\xa0·\\xa0169th |
|---|
| Currency | Afghani (افغانی) (AFN) |
|---|
| Time zone | UTC+4:30 Solar Calendar (D†) |
|---|
| Driving side | right |
|---|
| Calling code | +93 |
|---|
| ISO 3166 code | AF |
|---|
| Internet TLD | .af افغانستان. |
'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(df.iloc[0].country_html)\n",
"content = \"\".join(df.iloc[0].country_html)\n",
"content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97c1e41f-30f3-4116-aa11-5797e05b95ba",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"toc-autonumbering": true,
"toc-showcode": false
},
"nbformat": 4,
"nbformat_minor": 5
}