2035 lines
81 KiB
Plaintext
2035 lines
81 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a1e7252d-16d9-4b68-a855-d94a89132291",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Capital extraction"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "e4bb164b-f8a7-4d21-86a0-618ca78e9386",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:44:25.083012Z",
|
||
"iopub.status.busy": "2022-06-25T20:44:25.082584Z",
|
||
"iopub.status.idle": "2022-06-25T20:44:25.400728Z",
|
||
"shell.execute_reply": "2022-06-25T20:44:25.399967Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:44:25.082926Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import json\n",
|
||
"import pathlib\n",
|
||
"import re\n",
|
||
"import xml.etree.ElementTree as ET\n",
|
||
"\n",
|
||
"import pandas as pd\n",
|
||
"from lxml import etree"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9fd69c5d-2c4d-49c8-a042-82eade1d6ab7",
|
||
"metadata": {},
|
||
"source": [
|
||
"## load data\n",
|
||
"\n",
|
||
"### load the raw countries data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "bfe405bb-7879-4f5f-85df-7215c5e8a4b8",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:44:25.402295Z",
|
||
"iopub.status.busy": "2022-06-25T20:44:25.401710Z",
|
||
"iopub.status.idle": "2022-06-25T20:44:25.406212Z",
|
||
"shell.execute_reply": "2022-06-25T20:44:25.405622Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:44:25.402274Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"data_directory = (\n",
|
||
" pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "49fdb7c8-b9db-4d0a-8d2b-d4a0a3ddcdd9",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:45:11.573098Z",
|
||
"iopub.status.busy": "2022-06-25T20:45:11.572720Z",
|
||
"iopub.status.idle": "2022-06-25T20:45:11.650803Z",
|
||
"shell.execute_reply": "2022-06-25T20:45:11.650139Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:45:11.573067Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Afghanistan</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Afghanistan</td>\n",
|
||
" <td>Afghanistan</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Croatia</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Croatia</td>\n",
|
||
" <td>Croatia</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Costa_Rica</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Costa_Rica</td>\n",
|
||
" <td>Costa_Rica</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <b><a href=\"/wiki/National_flag\" title=...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Democratic_Repub...</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_the_Demo...</td>\n",
|
||
" <td>Democratic_Republic_of_the_Congo</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Comoros</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Comoros</td>\n",
|
||
" <td>Comoros</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <b><a href=\"/wiki/National_flag\" title=...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>201</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Antigua_and_Barbuda</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Antigua_...</td>\n",
|
||
" <td>Antigua_and_Barbuda</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>202</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Angola</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Angola</td>\n",
|
||
" <td>Angola</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>203</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Andorra</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Andorra</td>\n",
|
||
" <td>Andorra</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>204</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Algeria</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Algeria</td>\n",
|
||
" <td>Algeria</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>205</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Albania</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Albania</td>\n",
|
||
" <td>Albania</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <b>flag of Albania</b> (<a href=\"/wiki/...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>206 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"0 https://en.wikipedia.org/wiki/Afghanistan \n",
|
||
"1 https://en.wikipedia.org/wiki/Croatia \n",
|
||
"2 https://en.wikipedia.org/wiki/Costa_Rica \n",
|
||
"3 https://en.wikipedia.org/wiki/Democratic_Repub... \n",
|
||
"4 https://en.wikipedia.org/wiki/Comoros \n",
|
||
".. ... \n",
|
||
"201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n",
|
||
"202 https://en.wikipedia.org/wiki/Angola \n",
|
||
"203 https://en.wikipedia.org/wiki/Andorra \n",
|
||
"204 https://en.wikipedia.org/wiki/Algeria \n",
|
||
"205 https://en.wikipedia.org/wiki/Albania \n",
|
||
"\n",
|
||
" flag_description_url \\\n",
|
||
"0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n",
|
||
"1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n",
|
||
"2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n",
|
||
"3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n",
|
||
"4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n",
|
||
".. ... \n",
|
||
"201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n",
|
||
"202 https://en.wikipedia.org/wiki/Flag_of_Angola \n",
|
||
"203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n",
|
||
"204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n",
|
||
"205 https://en.wikipedia.org/wiki/Flag_of_Albania \n",
|
||
"\n",
|
||
" short_country_name \\\n",
|
||
"0 Afghanistan \n",
|
||
"1 Croatia \n",
|
||
"2 Costa_Rica \n",
|
||
"3 Democratic_Republic_of_the_Congo \n",
|
||
"4 Comoros \n",
|
||
".. ... \n",
|
||
"201 Antigua_and_Barbuda \n",
|
||
"202 Angola \n",
|
||
"203 Andorra \n",
|
||
"204 Algeria \n",
|
||
"205 Albania \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"0 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"1 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"2 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"3 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"4 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
".. ... \n",
|
||
"201 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"202 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"203 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"204 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"205 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"0 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"1 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"2 <p>The <b><a href=\"/wiki/National_flag\" title=... \n",
|
||
"3 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"4 <p>The <b><a href=\"/wiki/National_flag\" title=... \n",
|
||
".. ... \n",
|
||
"201 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"202 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"203 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"204 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"205 <p>The <b>flag of Albania</b> (<a href=\"/wiki/... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"0 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"1 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"2 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"3 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"4 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
".. ... \n",
|
||
"201 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"202 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"203 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"204 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"205 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"0 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"1 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"2 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"3 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"4 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
".. ... \n",
|
||
"201 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"202 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"203 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"204 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"205 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"\n",
|
||
"[206 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries = pd.read_json(data_directory / \"countries.json\")\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "7e7398db-96ea-44e7-a13c-b075e8aecc30",
|
||
"metadata": {},
|
||
"source": [
|
||
"## cleaning\n",
|
||
"\n",
|
||
"### clean `short_country_name`"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "c2eb9f21-5614-4d87-bf54-b9583160da93",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:45:13.187066Z",
|
||
"iopub.status.busy": "2022-06-25T20:45:13.186714Z",
|
||
"iopub.status.idle": "2022-06-25T20:45:13.191480Z",
|
||
"shell.execute_reply": "2022-06-25T20:45:13.190663Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:45:13.187037Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"countries[\"short_country_name\"] = countries[\"short_country_name\"].map(\n",
|
||
" lambda country: country.replace(\"_\", \" \")\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "29e35334-b5fe-4851-81c1-4150f0ed8b00",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:45:13.645989Z",
|
||
"iopub.status.busy": "2022-06-25T20:45:13.645391Z",
|
||
"iopub.status.idle": "2022-06-25T20:45:13.689080Z",
|
||
"shell.execute_reply": "2022-06-25T20:45:13.688077Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:45:13.645946Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Afghanistan</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Afghanistan</td>\n",
|
||
" <td>Afghanistan</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Croatia</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Croatia</td>\n",
|
||
" <td>Croatia</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Costa_Rica</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Costa_Rica</td>\n",
|
||
" <td>Costa Rica</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <b><a href=\"/wiki/National_flag\" title=...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Democratic_Repub...</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_the_Demo...</td>\n",
|
||
" <td>Democratic Republic of the Congo</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Comoros</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Comoros</td>\n",
|
||
" <td>Comoros</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <b><a href=\"/wiki/National_flag\" title=...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>201</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Antigua_and_Barbuda</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Antigua_...</td>\n",
|
||
" <td>Antigua and Barbuda</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>202</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Angola</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Angola</td>\n",
|
||
" <td>Angola</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>203</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Andorra</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Andorra</td>\n",
|
||
" <td>Andorra</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>204</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Algeria</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Algeria</td>\n",
|
||
" <td>Algeria</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>205</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Albania</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Albania</td>\n",
|
||
" <td>Albania</td>\n",
|
||
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||
" <td><p>The <b>flag of Albania</b> (<a href=\"/wiki/...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>206 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"0 https://en.wikipedia.org/wiki/Afghanistan \n",
|
||
"1 https://en.wikipedia.org/wiki/Croatia \n",
|
||
"2 https://en.wikipedia.org/wiki/Costa_Rica \n",
|
||
"3 https://en.wikipedia.org/wiki/Democratic_Repub... \n",
|
||
"4 https://en.wikipedia.org/wiki/Comoros \n",
|
||
".. ... \n",
|
||
"201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n",
|
||
"202 https://en.wikipedia.org/wiki/Angola \n",
|
||
"203 https://en.wikipedia.org/wiki/Andorra \n",
|
||
"204 https://en.wikipedia.org/wiki/Algeria \n",
|
||
"205 https://en.wikipedia.org/wiki/Albania \n",
|
||
"\n",
|
||
" flag_description_url \\\n",
|
||
"0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n",
|
||
"1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n",
|
||
"2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n",
|
||
"3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n",
|
||
"4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n",
|
||
".. ... \n",
|
||
"201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n",
|
||
"202 https://en.wikipedia.org/wiki/Flag_of_Angola \n",
|
||
"203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n",
|
||
"204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n",
|
||
"205 https://en.wikipedia.org/wiki/Flag_of_Albania \n",
|
||
"\n",
|
||
" short_country_name \\\n",
|
||
"0 Afghanistan \n",
|
||
"1 Croatia \n",
|
||
"2 Costa Rica \n",
|
||
"3 Democratic Republic of the Congo \n",
|
||
"4 Comoros \n",
|
||
".. ... \n",
|
||
"201 Antigua and Barbuda \n",
|
||
"202 Angola \n",
|
||
"203 Andorra \n",
|
||
"204 Algeria \n",
|
||
"205 Albania \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"0 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"1 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"2 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"3 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"4 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
".. ... \n",
|
||
"201 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"202 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"203 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"204 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"205 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"0 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"1 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"2 <p>The <b><a href=\"/wiki/National_flag\" title=... \n",
|
||
"3 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"4 <p>The <b><a href=\"/wiki/National_flag\" title=... \n",
|
||
".. ... \n",
|
||
"201 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"202 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"203 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"204 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"205 <p>The <b>flag of Albania</b> (<a href=\"/wiki/... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"0 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"1 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"2 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"3 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"4 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
".. ... \n",
|
||
"201 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"202 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"203 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"204 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"205 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"0 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"1 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"2 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"3 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"4 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
".. ... \n",
|
||
"201 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"202 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"203 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"204 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"205 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"\n",
|
||
"[206 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Display the frame again to check that underscores in `short_country_name` became spaces.\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e6bac76b-778f-4d4e-9a8b-8dc2838eb920",
|
||
"metadata": {},
|
||
"source": [
|
||
"### convert `country_html` to a single string"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "169123c9-aabb-4972-af5c-ba076cc21f5a",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:45:14.942724Z",
|
||
"iopub.status.busy": "2022-06-25T20:45:14.942372Z",
|
||
"iopub.status.idle": "2022-06-25T20:45:14.978155Z",
|
||
"shell.execute_reply": "2022-06-25T20:45:14.977401Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:45:14.942694Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Afghanistan</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Afghanistan</td>\n",
|
||
" <td>Afghanistan</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Croatia</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Croatia</td>\n",
|
||
" <td>Croatia</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Costa_Rica</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Costa_Rica</td>\n",
|
||
" <td>Costa Rica</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <b><a href=\"/wiki/National_flag\" title=...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Democratic_Repub...</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_the_Demo...</td>\n",
|
||
" <td>Democratic Republic of the Congo</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Comoros</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Comoros</td>\n",
|
||
" <td>Comoros</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <b><a href=\"/wiki/National_flag\" title=...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>201</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Antigua_and_Barbuda</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Antigua_...</td>\n",
|
||
" <td>Antigua and Barbuda</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>202</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Angola</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Angola</td>\n",
|
||
" <td>Angola</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>203</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Andorra</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Andorra</td>\n",
|
||
" <td>Andorra</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>204</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Algeria</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Algeria</td>\n",
|
||
" <td>Algeria</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>205</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Albania</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Albania</td>\n",
|
||
" <td>Albania</td>\n",
|
||
" <td><tr><th colspan=\"2\" class=\"infobox-above adr\">...</td>\n",
|
||
" <td><p>The <b>flag of Albania</b> (<a href=\"/wiki/...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>206 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"0 https://en.wikipedia.org/wiki/Afghanistan \n",
|
||
"1 https://en.wikipedia.org/wiki/Croatia \n",
|
||
"2 https://en.wikipedia.org/wiki/Costa_Rica \n",
|
||
"3 https://en.wikipedia.org/wiki/Democratic_Repub... \n",
|
||
"4 https://en.wikipedia.org/wiki/Comoros \n",
|
||
".. ... \n",
|
||
"201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n",
|
||
"202 https://en.wikipedia.org/wiki/Angola \n",
|
||
"203 https://en.wikipedia.org/wiki/Andorra \n",
|
||
"204 https://en.wikipedia.org/wiki/Algeria \n",
|
||
"205 https://en.wikipedia.org/wiki/Albania \n",
|
||
"\n",
|
||
" flag_description_url \\\n",
|
||
"0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n",
|
||
"1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n",
|
||
"2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n",
|
||
"3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n",
|
||
"4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n",
|
||
".. ... \n",
|
||
"201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n",
|
||
"202 https://en.wikipedia.org/wiki/Flag_of_Angola \n",
|
||
"203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n",
|
||
"204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n",
|
||
"205 https://en.wikipedia.org/wiki/Flag_of_Albania \n",
|
||
"\n",
|
||
" short_country_name \\\n",
|
||
"0 Afghanistan \n",
|
||
"1 Croatia \n",
|
||
"2 Costa Rica \n",
|
||
"3 Democratic Republic of the Congo \n",
|
||
"4 Comoros \n",
|
||
".. ... \n",
|
||
"201 Antigua and Barbuda \n",
|
||
"202 Angola \n",
|
||
"203 Andorra \n",
|
||
"204 Algeria \n",
|
||
"205 Albania \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"0 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"1 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"2 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"3 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"4 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
".. ... \n",
|
||
"201 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"202 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"203 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"204 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"205 <tr><th colspan=\"2\" class=\"infobox-above adr\">... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"0 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"1 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"2 <p>The <b><a href=\"/wiki/National_flag\" title=... \n",
|
||
"3 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"4 <p>The <b><a href=\"/wiki/National_flag\" title=... \n",
|
||
".. ... \n",
|
||
"201 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"202 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"203 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"204 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"205 <p>The <b>flag of Albania</b> (<a href=\"/wiki/... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"0 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"1 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"2 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"3 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"4 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
".. ... \n",
|
||
"201 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"202 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"203 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"204 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"205 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"0 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"1 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"2 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"3 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"4 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
".. ... \n",
|
||
"201 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"202 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"203 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"204 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"205 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"\n",
|
||
"[206 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries[\"country_html\"] = countries[\"country_html\"].map(lambda html: \"\".join(html))\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "3cf0621e-4ed8-44e4-883f-4fdece8ccbef",
|
||
"metadata": {},
|
||
"source": [
|
||
"### remove `<br>` tags"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "ab6166f9-daa2-4591-9989-8d33f3f98533",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:45:51.019933Z",
|
||
"iopub.status.busy": "2022-06-25T20:45:51.019577Z",
|
||
"iopub.status.idle": "2022-06-25T20:45:51.041851Z",
|
||
"shell.execute_reply": "2022-06-25T20:45:51.041008Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:45:51.019903Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"countries[\"country_html\"] = countries[\"country_html\"].map(\n",
|
||
" lambda html: html.replace(\"<br>\", \"\")\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ce0e413d-1763-4520-bc36-29715963718c",
|
||
"metadata": {},
|
||
"source": [
|
||
"### add root node"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "df924f80-c239-4a35-bc30-888589b34f0b",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:47:29.097287Z",
|
||
"iopub.status.busy": "2022-06-25T20:47:29.096936Z",
|
||
"iopub.status.idle": "2022-06-25T20:47:29.109510Z",
|
||
"shell.execute_reply": "2022-06-25T20:47:29.108689Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:47:29.097258Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"countries[\"country_html\"] = countries[\"country_html\"].map(\n",
|
||
" lambda html: f\"<div>{html}</div>\"\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "f5e7249e-988c-420e-bd02-f47d96fd0685",
|
||
"metadata": {},
|
||
"source": [
|
||
"## parse\n",
|
||
"\n",
|
||
"### parse string as xml"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "691bd250-44b9-4adb-9430-b0ef624c986b",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:47:50.180864Z",
|
||
"iopub.status.busy": "2022-06-25T20:47:50.180516Z",
|
||
"iopub.status.idle": "2022-06-25T20:47:50.184798Z",
|
||
"shell.execute_reply": "2022-06-25T20:47:50.183956Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:47:50.180835Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"parser = etree.XMLParser(recover=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9967cd92-390a-4b1e-a946-32d46a898eb9",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Afghanistan"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "915c338e-9225-46a3-b833-89fdeb971c88",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:47:51.179025Z",
|
||
"iopub.status.busy": "2022-06-25T20:47:51.178417Z",
|
||
"iopub.status.idle": "2022-06-25T20:47:51.187219Z",
|
||
"shell.execute_reply": "2022-06-25T20:47:51.186016Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:47:51.178977Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# root = etree.fromstringlist(countries[\"country_html\"].iloc[0], parser)\n",
|
||
"root = etree.fromstring(countries[\"country_html\"].iloc[0], parser)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"id": "771453ab-2b00-42e5-a026-3bcea7fc6476",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:47:52.012467Z",
|
||
"iopub.status.busy": "2022-06-25T20:47:52.011430Z",
|
||
"iopub.status.idle": "2022-06-25T20:47:52.020811Z",
|
||
"shell.execute_reply": "2022-06-25T20:47:52.019658Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:47:52.012399Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Element div at 0x123031f80>"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"root"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "516153ae-95b8-44b0-8bd3-51914ba95f4c",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:47:52.947860Z",
|
||
"iopub.status.busy": "2022-06-25T20:47:52.947172Z",
|
||
"iopub.status.idle": "2022-06-25T20:47:52.955283Z",
|
||
"shell.execute_reply": "2022-06-25T20:47:52.954038Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:47:52.947809Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"b'<a href=\"/wiki/Kabul\" title=\"Kabul\">Kabul</a>'\n",
|
||
"Kabul\n",
|
||
"<Element a at 0x122f81100>\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td/a\"):\n",
|
||
" print(etree.tostring(element))\n",
|
||
" print(element.text)\n",
|
||
" print(element)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a9d0a316-fa1b-4b04-9090-ab90f72dee19",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### South Africa"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"id": "769b4492-28a0-4454-8e92-806c4e646660",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:47:58.319337Z",
|
||
"iopub.status.busy": "2022-06-25T20:47:58.317420Z",
|
||
"iopub.status.idle": "2022-06-25T20:47:58.339980Z",
|
||
"shell.execute_reply": "2022-06-25T20:47:58.339214Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:47:58.319276Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Central_African_...</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_the_Cent...</td>\n",
|
||
" <td>Central African Republic</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <b>flag of the <a href=\"/wiki/Central_A...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>58</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/South_Africa</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_South_Af...</td>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <b>flag of South Africa</b> was designe...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"9 https://en.wikipedia.org/wiki/Central_African_... \n",
|
||
"58 https://en.wikipedia.org/wiki/South_Africa \n",
|
||
"\n",
|
||
" flag_description_url \\\n",
|
||
"9 https://en.wikipedia.org/wiki/Flag_of_the_Cent... \n",
|
||
"58 https://en.wikipedia.org/wiki/Flag_of_South_Af... \n",
|
||
"\n",
|
||
" short_country_name \\\n",
|
||
"9 Central African Republic \n",
|
||
"58 South Africa \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"9 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"58 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"9 <p>The <b>flag of the <a href=\"/wiki/Central_A... \n",
|
||
"58 <p>The <b>flag of South Africa</b> was designe... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"9 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"58 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"9 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"58 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries[countries[\"short_country_name\"].map(lambda country: \"Africa\" in country)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 60,
|
||
"id": "f06833fe-dee5-42dc-b0ad-71f9759e0fb1",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:21:03.803935Z",
|
||
"iopub.status.busy": "2022-06-25T21:21:03.803558Z",
|
||
"iopub.status.idle": "2022-06-25T21:21:03.809310Z",
|
||
"shell.execute_reply": "2022-06-25T21:21:03.808433Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:21:03.803906Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# root = etree.fromstringlist(countries[\"country_html\"].iloc[58], parser)\n",
|
||
"root = etree.fromstring(countries[\"country_html\"].iloc[58], parser)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"id": "70f25f0c-92bb-43ee-9223-15d1ad9fb691",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:22:53.884677Z",
|
||
"iopub.status.busy": "2022-06-25T21:22:53.884343Z",
|
||
"iopub.status.idle": "2022-06-25T21:22:53.893752Z",
|
||
"shell.execute_reply": "2022-06-25T21:22:53.893051Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:22:53.884650Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||
" for capital in element:\n",
|
||
" capital_filter = capital.xpath(\"ul//a\")\n",
|
||
" branch_filter = capital.xpath(\"ul//li\")\n",
|
||
"\n",
|
||
" _capitals = []\n",
|
||
" for item in capital_filter:\n",
|
||
" if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n",
|
||
" _capitals.append(item.text)\n",
|
||
" for match in branch_filter:\n",
|
||
" branch = match.xpath(\"text()\")\n",
|
||
" _capitals.append(re.search(r\"(?:\\()([^\\)]*)\", branch[0])[1])\n",
|
||
"\n",
|
||
" result = {\n",
|
||
" \"index\": 58,\n",
|
||
" \"country_name\": countries[\"short_country_name\"].iloc[58],\n",
|
||
" \"capital\": _capitals,\n",
|
||
" }"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"id": "4d6fffaa-c08f-4ae9-b1d4-d1d27550a903",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:22:57.761492Z",
|
||
"iopub.status.busy": "2022-06-25T21:22:57.761115Z",
|
||
"iopub.status.idle": "2022-06-25T21:22:57.767381Z",
|
||
"shell.execute_reply": "2022-06-25T21:22:57.766523Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:22:57.761462Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'index': 58,\n",
|
||
" 'country_name': 'South Africa',\n",
|
||
" 'capital': ['Pretoria',\n",
|
||
" 'Cape Town',\n",
|
||
" 'Bloemfontein',\n",
|
||
" 'executive',\n",
|
||
" 'legislative',\n",
|
||
" 'judicial']}"
|
||
]
|
||
},
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"result"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8af8a656-3e07-4de0-933e-012d218c18f9",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Albania"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "33205ffb-1b50-468b-8d5b-3c7302a8b285",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:48:07.893538Z",
|
||
"iopub.status.busy": "2022-06-25T20:48:07.892909Z",
|
||
"iopub.status.idle": "2022-06-25T20:48:07.905915Z",
|
||
"shell.execute_reply": "2022-06-25T20:48:07.905138Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:48:07.893507Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>205</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Albania</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Albania</td>\n",
|
||
" <td>Albania</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <b>flag of Albania</b> (<a href=\"/wiki/...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"205 https://en.wikipedia.org/wiki/Albania \n",
|
||
"\n",
|
||
" flag_description_url short_country_name \\\n",
|
||
"205 https://en.wikipedia.org/wiki/Flag_of_Albania Albania \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"205 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"205 <p>The <b>flag of Albania</b> (<a href=\"/wiki/... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"205 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"205 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
||
]
|
||
},
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries[countries[\"short_country_name\"].map(lambda country: \"Albania\" in country)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "482c1bae-c55c-43dd-b603-a297140fe428",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:48:15.103731Z",
|
||
"iopub.status.busy": "2022-06-25T20:48:15.103380Z",
|
||
"iopub.status.idle": "2022-06-25T20:48:15.108883Z",
|
||
"shell.execute_reply": "2022-06-25T20:48:15.108029Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:48:15.103703Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"root = etree.fromstring(countries[\"country_html\"].iloc[205], parser)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"id": "79966a1a-3017-40aa-a639-889770b68419",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T20:48:17.351871Z",
|
||
"iopub.status.busy": "2022-06-25T20:48:17.350564Z",
|
||
"iopub.status.idle": "2022-06-25T20:48:17.360415Z",
|
||
"shell.execute_reply": "2022-06-25T20:48:17.358988Z",
|
||
"shell.execute_reply.started": "2022-06-25T20:48:17.351827Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<Element td at 0x123fc9840>\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||
" print(element)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "5c316764-016f-49be-a3ad-1485dceb5b0c",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Sahrawi Arab Democratic Republic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 80,
|
||
"id": "3dfd7e29-6166-483a-b454-09daadf2ea20",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:26:21.907262Z",
|
||
"iopub.status.busy": "2022-06-25T21:26:21.906430Z",
|
||
"iopub.status.idle": "2022-06-25T21:26:21.924749Z",
|
||
"shell.execute_reply": "2022-06-25T21:26:21.923775Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:26:21.907228Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...</td>\n",
|
||
" <td>Sahrawi Arab Democratic Republic</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <b>flag of Western Sahara</b> (Arabic: ...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>39</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/United_Arab_Emir...</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_the_Unit...</td>\n",
|
||
" <td>United Arab Emirates</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <a href=\"/wiki/National_flag\" title=\"Na...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>69</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Saudi_Arabia</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Saudi_Ar...</td>\n",
|
||
" <td>Saudi Arabia</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <b>flag of the Kingdom of Saudi Arabia<...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"20 https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem... \n",
|
||
"39 https://en.wikipedia.org/wiki/United_Arab_Emir... \n",
|
||
"69 https://en.wikipedia.org/wiki/Saudi_Arabia \n",
|
||
"\n",
|
||
" flag_description_url \\\n",
|
||
"20 https://en.wikipedia.org/wiki/Flag_of_Sahrawi_... \n",
|
||
"39 https://en.wikipedia.org/wiki/Flag_of_the_Unit... \n",
|
||
"69 https://en.wikipedia.org/wiki/Flag_of_Saudi_Ar... \n",
|
||
"\n",
|
||
" short_country_name \\\n",
|
||
"20 Sahrawi Arab Democratic Republic \n",
|
||
"39 United Arab Emirates \n",
|
||
"69 Saudi Arabia \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"20 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"39 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"69 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"20 <p>The <b>flag of Western Sahara</b> (Arabic: ... \n",
|
||
"39 <p>The <a href=\"/wiki/National_flag\" title=\"Na... \n",
|
||
"69 <p>The <b>flag of the Kingdom of Saudi Arabia<... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"20 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"39 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"69 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"20 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"39 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||
"69 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
||
]
|
||
},
|
||
"execution_count": 80,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries[\n",
|
||
" countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 81,
|
||
"id": "02e1f53a-9081-4edc-9a61-41da56166274",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:26:26.463033Z",
|
||
"iopub.status.busy": "2022-06-25T21:26:26.462631Z",
|
||
"iopub.status.idle": "2022-06-25T21:26:26.469880Z",
|
||
"shell.execute_reply": "2022-06-25T21:26:26.469125Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:26:26.463004Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"country_url https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...\n",
|
||
"flag_description_url https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...\n",
|
||
"short_country_name Sahrawi Arab Democratic Republic\n",
|
||
"country_html <div><tr><th colspan=\"2\" class=\"infobox-above ...\n",
|
||
"flag_html <p>The <b>flag of Western Sahara</b> (Arabic: ...\n",
|
||
"file_urls [https:////upload.wikimedia.org/wikipedia/comm...\n",
|
||
"files [{'url': 'https://upload.wikimedia.org/wikiped...\n",
|
||
"Name: 20, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 81,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries.iloc[20]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 82,
|
||
"id": "3b74a848-af64-4ea8-b2b2-3b78c9fb843e",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:26:39.089757Z",
|
||
"iopub.status.busy": "2022-06-25T21:26:39.089324Z",
|
||
"iopub.status.idle": "2022-06-25T21:26:39.095857Z",
|
||
"shell.execute_reply": "2022-06-25T21:26:39.094902Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:26:39.089723Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"root = etree.fromstring(countries[\"country_html\"].iloc[20], parser)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 114,
|
||
"id": "f8829405-0533-4349-9c99-05e9e1e41607",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:49:34.812182Z",
|
||
"iopub.status.busy": "2022-06-25T21:49:34.811795Z",
|
||
"iopub.status.idle": "2022-06-25T21:49:34.820550Z",
|
||
"shell.execute_reply": "2022-06-25T21:49:34.819761Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:49:34.812153Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"b'<div class=\"plainlist\"><ul><li><a href=\"/wiki/Laayoune\" title=\"Laayoune\">El Aaiún</a><sup>a</sup> (<i>de jure</i>)</li><li><span class=\"nowrap\"><a href=\"/wiki/Tifariti\" title=\"Tifariti\">Tifariti</a> (<i>de facto</i>)</span></li></ul></div>'\n",
|
||
"b'<i>de jure</i>)'\n",
|
||
"match de jure\n",
|
||
"b'<i>de facto</i>)'\n",
|
||
"match de facto\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||
" for capital in element:\n",
|
||
" print(etree.tostring(capital))\n",
|
||
" capital_filter = capital.xpath(\"ul//a\")\n",
|
||
" branch_filter = capital.xpath(\"ul//li//i\")\n",
|
||
"\n",
|
||
" _capitals = []\n",
|
||
" for item in capital_filter:\n",
|
||
" if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n",
|
||
" _capitals.append(item.text)\n",
|
||
" for match in branch_filter:\n",
|
||
" print(etree.tostring(match))\n",
|
||
" print(\"match\", match.text)\n",
|
||
" if isinstance(\n",
|
||
" re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")),\n",
|
||
" re.Match,\n",
|
||
" ):\n",
|
||
" _capitals.append(match.text)\n",
|
||
" result = {\n",
|
||
" \"index\": 20,\n",
|
||
" \"country_name\": countries[\"short_country_name\"].iloc[20],\n",
|
||
" \"capital\": _capitals,\n",
|
||
" }"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 115,
|
||
"id": "92d7c4ac-44f9-46e6-b8ae-94dffa276f84",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T21:49:35.379470Z",
|
||
"iopub.status.busy": "2022-06-25T21:49:35.378687Z",
|
||
"iopub.status.idle": "2022-06-25T21:49:35.388295Z",
|
||
"shell.execute_reply": "2022-06-25T21:49:35.386602Z",
|
||
"shell.execute_reply.started": "2022-06-25T21:49:35.379399Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'index': 20,\n",
|
||
" 'country_name': 'Sahrawi Arab Democratic Republic',\n",
|
||
" 'capital': ['El Aaiún', 'Tifariti', 'de jure', 'de facto']}"
|
||
]
|
||
},
|
||
"execution_count": 115,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"result"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "7dbe415a-ba82-4813-9723-ac66ec9b29aa",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### State of Palestine"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 212,
|
||
"id": "b39a6451-19e0-4ec6-a925-89bcdb89c441",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T23:47:35.380713Z",
|
||
"iopub.status.busy": "2022-06-25T23:47:35.380097Z",
|
||
"iopub.status.idle": "2022-06-25T23:47:35.485628Z",
|
||
"shell.execute_reply": "2022-06-25T23:47:35.484621Z",
|
||
"shell.execute_reply.started": "2022-06-25T23:47:35.380654Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>country_url</th>\n",
|
||
" <th>flag_description_url</th>\n",
|
||
" <th>short_country_name</th>\n",
|
||
" <th>country_html</th>\n",
|
||
" <th>flag_html</th>\n",
|
||
" <th>file_urls</th>\n",
|
||
" <th>files</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>87</th>\n",
|
||
" <td>https://en.wikipedia.org/wiki/State_of_Palestine</td>\n",
|
||
" <td>https://en.wikipedia.org/wiki/Flag_of_Palestine</td>\n",
|
||
" <td>State of Palestine</td>\n",
|
||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||
" <td><p>The <b>flag of Palestine</b> (<a href=\"/wik...</td>\n",
|
||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" country_url \\\n",
|
||
"87 https://en.wikipedia.org/wiki/State_of_Palestine \n",
|
||
"\n",
|
||
" flag_description_url short_country_name \\\n",
|
||
"87 https://en.wikipedia.org/wiki/Flag_of_Palestine State of Palestine \n",
|
||
"\n",
|
||
" country_html \\\n",
|
||
"87 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||
"\n",
|
||
" flag_html \\\n",
|
||
"87 <p>The <b>flag of Palestine</b> (<a href=\"/wik... \n",
|
||
"\n",
|
||
" file_urls \\\n",
|
||
"87 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||
"\n",
|
||
" files \n",
|
||
"87 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
||
]
|
||
},
|
||
"execution_count": 212,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries[\n",
|
||
" countries[\"short_country_name\"].map(lambda country: \"palestine\" in country.lower())\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 213,
|
||
"id": "ab895aca-9ffe-4dcf-beb1-4ab9d314c82b",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-25T23:47:39.784680Z",
|
||
"iopub.status.busy": "2022-06-25T23:47:39.784136Z",
|
||
"iopub.status.idle": "2022-06-25T23:47:39.791893Z",
|
||
"shell.execute_reply": "2022-06-25T23:47:39.790751Z",
|
||
"shell.execute_reply.started": "2022-06-25T23:47:39.784625Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"root = etree.fromstring(countries[\"country_html\"].iloc[87], parser)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 262,
|
||
"id": "e78c8eb9-a7d3-4617-b4fa-0ef1afd7bf29",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-26T00:00:41.346966Z",
|
||
"iopub.status.busy": "2022-06-26T00:00:41.346256Z",
|
||
"iopub.status.idle": "2022-06-26T00:00:41.353815Z",
|
||
"shell.execute_reply": "2022-06-26T00:00:41.352825Z",
|
||
"shell.execute_reply.started": "2022-06-26T00:00:41.346905Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Jerusalem\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for element in root.xpath(\n",
|
||
" \"//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]\"\n",
|
||
"):\n",
|
||
" print(element.text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "df8d24f4-0a07-46fc-920d-21cb0d6472e6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "08e1bea0-6f04-4c61-98fb-90abd4750a94",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9d909d88-e086-41fe-a46f-1cd0a21ec625",
|
||
"metadata": {},
|
||
"source": [
|
||
"## define functions"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 271,
|
||
"id": "9263180c-92ed-4f5f-a479-70af90599b75",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-26T00:05:59.447161Z",
|
||
"iopub.status.busy": "2022-06-26T00:05:59.446696Z",
|
||
"iopub.status.idle": "2022-06-26T00:05:59.469995Z",
|
||
"shell.execute_reply": "2022-06-26T00:05:59.468232Z",
|
||
"shell.execute_reply.started": "2022-06-26T00:05:59.447116Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def extract_capital_0(index: int, country_name: str, country_html: str):\n",
|
||
" result = None\n",
|
||
" root = etree.fromstring(country_html, parser)\n",
|
||
"\n",
|
||
" # matches single capital\n",
|
||
" for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td/a\"):\n",
|
||
" result = {\"index\": index, \"country_name\": country_name, \"capital\": element.text}\n",
|
||
"\n",
|
||
" # matches multiple capitals\n",
|
||
" if result is None:\n",
|
||
" for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||
" capital_filter = element.xpath(\"div//a\")\n",
|
||
" branch_filter = element.xpath(\"div//ul/li/text()\")\n",
|
||
" _capitals = []\n",
|
||
" for root in capital_filter:\n",
|
||
" if root.text is not None:\n",
|
||
" _capitals.append(root.text)\n",
|
||
" for root in branch_filter:\n",
|
||
" if isinstance(\n",
|
||
" res := re.search(r\"(?:\\()([^/)]*)\", str(root).strip()), re.Match\n",
|
||
" ):\n",
|
||
" _capitals.append(res[1])\n",
|
||
" result = {\n",
|
||
" \"index\": index,\n",
|
||
" \"country_name\": country_name,\n",
|
||
" \"capital\": _capitals,\n",
|
||
" }\n",
|
||
"\n",
|
||
" # matches mutiple capitals with italic footnote\n",
|
||
" if result is None:\n",
|
||
" for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||
" for capital in element:\n",
|
||
" capital_filter = capital.xpath(\"ul//a\")\n",
|
||
" branch_filter = capital.xpath(\"ul//li//i\")\n",
|
||
"\n",
|
||
" _capitals = []\n",
|
||
" for item in capital_filter:\n",
|
||
" if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n",
|
||
" _capitals.append(item.text)\n",
|
||
" for match in branch_filter:\n",
|
||
" if isinstance(\n",
|
||
" re.search(\n",
|
||
" r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")\n",
|
||
" ),\n",
|
||
" re.Match,\n",
|
||
" ):\n",
|
||
" _capitals.append(match.text)\n",
|
||
" result = {\n",
|
||
" \"index\": index,\n",
|
||
" \"country_name\": country_name,\n",
|
||
" \"capital\": _capitals,\n",
|
||
" }\n",
|
||
"\n",
|
||
" # proclaimed capitals: e.g Palestine\n",
|
||
" if result is None:\n",
|
||
" for element in root.xpath(\n",
|
||
" \"//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]\"\n",
|
||
" ):\n",
|
||
" result = {\n",
|
||
" \"index\": index,\n",
|
||
" \"country_name\": country_name,\n",
|
||
" \"capital\": element.text,\n",
|
||
" }\n",
|
||
"\n",
|
||
" return result or None"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 273,
|
||
"id": "0b39d1c7-a1ee-4031-a0eb-93ab5ef525ff",
|
||
"metadata": {
|
||
"execution": {
|
||
"iopub.execute_input": "2022-06-26T00:06:10.855929Z",
|
||
"iopub.status.busy": "2022-06-26T00:06:10.855004Z",
|
||
"iopub.status.idle": "2022-06-26T00:06:11.012760Z",
|
||
"shell.execute_reply": "2022-06-26T00:06:11.011838Z",
|
||
"shell.execute_reply.started": "2022-06-26T00:06:10.855859Z"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"{'index': 23, 'country_name': 'Kosovo', 'capital': []}\n",
|
||
"33 Vatican City\n",
|
||
"64 Singapore\n",
|
||
"{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': []}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for index, country_name, country_html in zip(\n",
|
||
" countries.index, countries[\"short_country_name\"], countries[\"country_html\"]\n",
|
||
"):\n",
|
||
" result = extract_capital_0(index, country_name, country_html)\n",
|
||
" try:\n",
|
||
" if len(result[\"capital\"]) == 0:\n",
|
||
" print(result)\n",
|
||
" except TypeError:\n",
|
||
" print(index, country_name)\n",
|
||
"\n",
|
||
" # print(json.dumps(extract_capital_0(index, country_name, country_html)))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8c147ae2-cc6f-4ac9-9c6a-81fb8e71d1d1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|