{ "cells": [ { "cell_type": "markdown", "id": "a1e7252d-16d9-4b68-a855-d94a89132291", "metadata": {}, "source": [ "# Capital extraction" ] }, { "cell_type": "code", "execution_count": 1, "id": "e4bb164b-f8a7-4d21-86a0-618ca78e9386", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:44:25.083012Z", "iopub.status.busy": "2022-06-25T20:44:25.082584Z", "iopub.status.idle": "2022-06-25T20:44:25.400728Z", "shell.execute_reply": "2022-06-25T20:44:25.399967Z", "shell.execute_reply.started": "2022-06-25T20:44:25.082926Z" }, "tags": [] }, "outputs": [], "source": [ "import json\n", "import pathlib\n", "import re\n", "import xml.etree.ElementTree as ET\n", "\n", "import pandas as pd\n", "from lxml import etree" ] }, { "cell_type": "markdown", "id": "9fd69c5d-2c4d-49c8-a042-82eade1d6ab7", "metadata": {}, "source": [ "## load data\n", "\n", "### load the raw countries data" ] }, { "cell_type": "code", "execution_count": 2, "id": "bfe405bb-7879-4f5f-85df-7215c5e8a4b8", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:44:25.402295Z", "iopub.status.busy": "2022-06-25T20:44:25.401710Z", "iopub.status.idle": "2022-06-25T20:44:25.406212Z", "shell.execute_reply": "2022-06-25T20:44:25.405622Z", "shell.execute_reply.started": "2022-06-25T20:44:25.402274Z" }, "tags": [] }, "outputs": [], "source": [ "data_directory = (\n", " pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "id": "49fdb7c8-b9db-4d0a-8d2b-d4a0a3ddcdd9", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:45:11.573098Z", "iopub.status.busy": "2022-06-25T20:45:11.572720Z", "iopub.status.idle": "2022-06-25T20:45:11.650803Z", "shell.execute_reply": "2022-06-25T20:45:11.650139Z", "shell.execute_reply.started": "2022-06-25T20:45:11.573067Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
0https://en.wikipedia.org/wiki/Afghanistanhttps://en.wikipedia.org/wiki/Flag_of_AfghanistanAfghanistan[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
1https://en.wikipedia.org/wiki/Croatiahttps://en.wikipedia.org/wiki/Flag_of_CroatiaCroatia[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
2https://en.wikipedia.org/wiki/Costa_Ricahttps://en.wikipedia.org/wiki/Flag_of_Costa_RicaCosta_Rica[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
3https://en.wikipedia.org/wiki/Democratic_Repub...https://en.wikipedia.org/wiki/Flag_of_the_Demo...Democratic_Republic_of_the_Congo[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
4https://en.wikipedia.org/wiki/Comoroshttps://en.wikipedia.org/wiki/Flag_of_ComorosComoros[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
........................
201https://en.wikipedia.org/wiki/Antigua_and_Barbudahttps://en.wikipedia.org/wiki/Flag_of_Antigua_...Antigua_and_Barbuda[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
202https://en.wikipedia.org/wiki/Angolahttps://en.wikipedia.org/wiki/Flag_of_AngolaAngola[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
203https://en.wikipedia.org/wiki/Andorrahttps://en.wikipedia.org/wiki/Flag_of_AndorraAndorra[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
204https://en.wikipedia.org/wiki/Algeriahttps://en.wikipedia.org/wiki/Flag_of_AlgeriaAlgeria[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "

206 rows × 7 columns

\n", "
" ], "text/plain": [ " country_url \\\n", "0 https://en.wikipedia.org/wiki/Afghanistan \n", "1 https://en.wikipedia.org/wiki/Croatia \n", "2 https://en.wikipedia.org/wiki/Costa_Rica \n", "3 https://en.wikipedia.org/wiki/Democratic_Repub... \n", "4 https://en.wikipedia.org/wiki/Comoros \n", ".. ... \n", "201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n", "202 https://en.wikipedia.org/wiki/Angola \n", "203 https://en.wikipedia.org/wiki/Andorra \n", "204 https://en.wikipedia.org/wiki/Algeria \n", "205 https://en.wikipedia.org/wiki/Albania \n", "\n", " flag_description_url \\\n", "0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n", "1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n", "2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n", "3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n", "4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n", ".. ... \n", "201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n", "202 https://en.wikipedia.org/wiki/Flag_of_Angola \n", "203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n", "204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n", "205 https://en.wikipedia.org/wiki/Flag_of_Albania \n", "\n", " short_country_name \\\n", "0 Afghanistan \n", "1 Croatia \n", "2 Costa_Rica \n", "3 Democratic_Republic_of_the_Congo \n", "4 Comoros \n", ".. ... \n", "201 Antigua_and_Barbuda \n", "202 Angola \n", "203 Andorra \n", "204 Algeria \n", "205 Albania \n", "\n", " country_html \\\n", "0 [The The The The The The The The The The flag of Albania (\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
0https://en.wikipedia.org/wiki/Afghanistanhttps://en.wikipedia.org/wiki/Flag_of_AfghanistanAfghanistan[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
1https://en.wikipedia.org/wiki/Croatiahttps://en.wikipedia.org/wiki/Flag_of_CroatiaCroatia[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
2https://en.wikipedia.org/wiki/Costa_Ricahttps://en.wikipedia.org/wiki/Flag_of_Costa_RicaCosta Rica[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
3https://en.wikipedia.org/wiki/Democratic_Repub...https://en.wikipedia.org/wiki/Flag_of_the_Demo...Democratic Republic of the Congo[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
4https://en.wikipedia.org/wiki/Comoroshttps://en.wikipedia.org/wiki/Flag_of_ComorosComoros[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
........................
201https://en.wikipedia.org/wiki/Antigua_and_Barbudahttps://en.wikipedia.org/wiki/Flag_of_Antigua_...Antigua and Barbuda[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
202https://en.wikipedia.org/wiki/Angolahttps://en.wikipedia.org/wiki/Flag_of_AngolaAngola[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
203https://en.wikipedia.org/wiki/Andorrahttps://en.wikipedia.org/wiki/Flag_of_AndorraAndorra[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
204https://en.wikipedia.org/wiki/Algeriahttps://en.wikipedia.org/wiki/Flag_of_AlgeriaAlgeria[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "

206 rows × 7 columns

\n", "" ], "text/plain": [ " country_url \\\n", "0 https://en.wikipedia.org/wiki/Afghanistan \n", "1 https://en.wikipedia.org/wiki/Croatia \n", "2 https://en.wikipedia.org/wiki/Costa_Rica \n", "3 https://en.wikipedia.org/wiki/Democratic_Repub... \n", "4 https://en.wikipedia.org/wiki/Comoros \n", ".. ... \n", "201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n", "202 https://en.wikipedia.org/wiki/Angola \n", "203 https://en.wikipedia.org/wiki/Andorra \n", "204 https://en.wikipedia.org/wiki/Algeria \n", "205 https://en.wikipedia.org/wiki/Albania \n", "\n", " flag_description_url \\\n", "0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n", "1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n", "2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n", "3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n", "4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n", ".. ... \n", "201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n", "202 https://en.wikipedia.org/wiki/Flag_of_Angola \n", "203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n", "204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n", "205 https://en.wikipedia.org/wiki/Flag_of_Albania \n", "\n", " short_country_name \\\n", "0 Afghanistan \n", "1 Croatia \n", "2 Costa Rica \n", "3 Democratic Republic of the Congo \n", "4 Comoros \n", ".. ... \n", "201 Antigua and Barbuda \n", "202 Angola \n", "203 Andorra \n", "204 Algeria \n", "205 Albania \n", "\n", " country_html \\\n", "0 [The
The The The The The The The The The flag of Albania (\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
0https://en.wikipedia.org/wiki/Afghanistanhttps://en.wikipedia.org/wiki/Flag_of_AfghanistanAfghanistan<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
1https://en.wikipedia.org/wiki/Croatiahttps://en.wikipedia.org/wiki/Flag_of_CroatiaCroatia<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
2https://en.wikipedia.org/wiki/Costa_Ricahttps://en.wikipedia.org/wiki/Flag_of_Costa_RicaCosta Rica<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
3https://en.wikipedia.org/wiki/Democratic_Repub...https://en.wikipedia.org/wiki/Flag_of_the_Demo...Democratic Republic of the Congo<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
4https://en.wikipedia.org/wiki/Comoroshttps://en.wikipedia.org/wiki/Flag_of_ComorosComoros<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
........................
201https://en.wikipedia.org/wiki/Antigua_and_Barbudahttps://en.wikipedia.org/wiki/Flag_of_Antigua_...Antigua and Barbuda<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
202https://en.wikipedia.org/wiki/Angolahttps://en.wikipedia.org/wiki/Flag_of_AngolaAngola<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
203https://en.wikipedia.org/wiki/Andorrahttps://en.wikipedia.org/wiki/Flag_of_AndorraAndorra<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
204https://en.wikipedia.org/wiki/Algeriahttps://en.wikipedia.org/wiki/Flag_of_AlgeriaAlgeria<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "

206 rows × 7 columns

\n", "" ], "text/plain": [ " country_url \\\n", "0 https://en.wikipedia.org/wiki/Afghanistan \n", "1 https://en.wikipedia.org/wiki/Croatia \n", "2 https://en.wikipedia.org/wiki/Costa_Rica \n", "3 https://en.wikipedia.org/wiki/Democratic_Repub... \n", "4 https://en.wikipedia.org/wiki/Comoros \n", ".. ... \n", "201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n", "202 https://en.wikipedia.org/wiki/Angola \n", "203 https://en.wikipedia.org/wiki/Andorra \n", "204 https://en.wikipedia.org/wiki/Algeria \n", "205 https://en.wikipedia.org/wiki/Albania \n", "\n", " flag_description_url \\\n", "0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n", "1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n", "2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n", "3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n", "4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n", ".. ... \n", "201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n", "202 https://en.wikipedia.org/wiki/Flag_of_Angola \n", "203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n", "204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n", "205 https://en.wikipedia.org/wiki/Flag_of_Albania \n", "\n", " short_country_name \\\n", "0 Afghanistan \n", "1 Croatia \n", "2 Costa Rica \n", "3 Democratic Republic of the Congo \n", "4 Comoros \n", ".. ... \n", "201 Antigua and Barbuda \n", "202 Angola \n", "203 Andorra \n", "204 Algeria \n", "205 Albania \n", "\n", " country_html \\\n", "0 ... \n", "1 ... \n", "2 ... \n", "3 ... \n", "4 ... \n", ".. ... \n", "201 ... \n", "202 ... \n", "203 ... \n", "204 ... \n", "205 ... \n", "\n", " flag_html \\\n", "0

The The The The The The The The The The flag of Albania (` tags" ] }, { "cell_type": "code", "execution_count": 25, "id": "ab6166f9-daa2-4591-9989-8d33f3f98533", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:45:51.019933Z", "iopub.status.busy": "2022-06-25T20:45:51.019577Z", "iopub.status.idle": "2022-06-25T20:45:51.041851Z", "shell.execute_reply": "2022-06-25T20:45:51.041008Z", "shell.execute_reply.started": "2022-06-25T20:45:51.019903Z" }, "tags": [] }, "outputs": [], "source": [ "countries[\"country_html\"] = countries[\"country_html\"].map(\n", " lambda html: html.replace(\"
\", \"\")\n", ")" ] }, { "cell_type": "markdown", "id": "ce0e413d-1763-4520-bc36-29715963718c", "metadata": {}, "source": [ "### add root node" ] }, { "cell_type": "code", "execution_count": 29, "id": "df924f80-c239-4a35-bc30-888589b34f0b", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:47:29.097287Z", "iopub.status.busy": "2022-06-25T20:47:29.096936Z", "iopub.status.idle": "2022-06-25T20:47:29.109510Z", "shell.execute_reply": "2022-06-25T20:47:29.108689Z", "shell.execute_reply.started": "2022-06-25T20:47:29.097258Z" }, "tags": [] }, "outputs": [], "source": [ "countries[\"country_html\"] = countries[\"country_html\"].map(\n", " lambda html: f\"

{html}
\"\n", ")" ] }, { "cell_type": "markdown", "id": "f5e7249e-988c-420e-bd02-f47d96fd0685", "metadata": {}, "source": [ "## parse\n", "\n", "### parse string as xml" ] }, { "cell_type": "code", "execution_count": 32, "id": "691bd250-44b9-4adb-9430-b0ef624c986b", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:47:50.180864Z", "iopub.status.busy": "2022-06-25T20:47:50.180516Z", "iopub.status.idle": "2022-06-25T20:47:50.184798Z", "shell.execute_reply": "2022-06-25T20:47:50.183956Z", "shell.execute_reply.started": "2022-06-25T20:47:50.180835Z" }, "tags": [] }, "outputs": [], "source": [ "parser = etree.XMLParser(recover=True)" ] }, { "cell_type": "markdown", "id": "9967cd92-390a-4b1e-a946-32d46a898eb9", "metadata": {}, "source": [ "#### Afganistan" ] }, { "cell_type": "code", "execution_count": 33, "id": "915c338e-9225-46a3-b833-89fdeb971c88", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:47:51.179025Z", "iopub.status.busy": "2022-06-25T20:47:51.178417Z", "iopub.status.idle": "2022-06-25T20:47:51.187219Z", "shell.execute_reply": "2022-06-25T20:47:51.186016Z", "shell.execute_reply.started": "2022-06-25T20:47:51.178977Z" }, "tags": [] }, "outputs": [], "source": [ "# root = etree.fromstringlist(countries[\"country_html\"].iloc[0], parser)\n", "root = etree.fromstring(countries[\"country_html\"].iloc[0], parser)" ] }, { "cell_type": "code", "execution_count": 34, "id": "771453ab-2b00-42e5-a026-3bcea7fc6476", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:47:52.012467Z", "iopub.status.busy": "2022-06-25T20:47:52.011430Z", "iopub.status.idle": "2022-06-25T20:47:52.020811Z", "shell.execute_reply": "2022-06-25T20:47:52.019658Z", "shell.execute_reply.started": "2022-06-25T20:47:52.012399Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "root" ] }, { "cell_type": "code", "execution_count": 35, "id": "516153ae-95b8-44b0-8bd3-51914ba95f4c", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:47:52.947860Z", "iopub.status.busy": "2022-06-25T20:47:52.947172Z", "iopub.status.idle": "2022-06-25T20:47:52.955283Z", "shell.execute_reply": "2022-06-25T20:47:52.954038Z", "shell.execute_reply.started": "2022-06-25T20:47:52.947809Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'Kabul'\n", "Kabul\n", "\n" ] } ], "source": [ "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td/a\"):\n", " print(etree.tostring(element))\n", " print(element.text)\n", " print(element)" ] }, { "cell_type": "markdown", "id": "a9d0a316-fa1b-4b04-9090-ab90f72dee19", "metadata": {}, "source": [ "#### South Africa" ] }, { "cell_type": "code", "execution_count": 36, "id": "769b4492-28a0-4454-8e92-806c4e646660", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:47:58.319337Z", "iopub.status.busy": "2022-06-25T20:47:58.317420Z", "iopub.status.idle": "2022-06-25T20:47:58.339980Z", "shell.execute_reply": "2022-06-25T20:47:58.339214Z", "shell.execute_reply.started": "2022-06-25T20:47:58.319276Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
9https://en.wikipedia.org/wiki/Central_African_...https://en.wikipedia.org/wiki/Flag_of_the_Cent...Central African Republic<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of the <a href=\"/wiki/Central_A...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
58https://en.wikipedia.org/wiki/South_Africahttps://en.wikipedia.org/wiki/Flag_of_South_Af...South Africa<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of South Africa</b> was designe...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "
" ], "text/plain": [ " country_url \\\n", "9 https://en.wikipedia.org/wiki/Central_African_... \n", "58 https://en.wikipedia.org/wiki/South_Africa \n", "\n", " flag_description_url \\\n", "9 https://en.wikipedia.org/wiki/Flag_of_the_Cent... \n", "58 https://en.wikipedia.org/wiki/Flag_of_South_Af... \n", "\n", " short_country_name \\\n", "9 Central African Republic \n", "58 South Africa \n", "\n", " country_html \\\n", "9
The flag of the The flag of South Africa was designe... \n", "\n", " file_urls \\\n", "9 [https:////upload.wikimedia.org/wikipedia/comm... \n", "58 [https:////upload.wikimedia.org/wikipedia/comm... \n", "\n", " files \n", "9 [{'url': 'https://upload.wikimedia.org/wikiped... \n", "58 [{'url': 'https://upload.wikimedia.org/wikiped... " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries[countries[\"short_country_name\"].map(lambda country: \"Africa\" in country)]" ] }, { "cell_type": "code", "execution_count": 60, "id": "f06833fe-dee5-42dc-b0ad-71f9759e0fb1", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:21:03.803935Z", "iopub.status.busy": "2022-06-25T21:21:03.803558Z", "iopub.status.idle": "2022-06-25T21:21:03.809310Z", "shell.execute_reply": "2022-06-25T21:21:03.808433Z", "shell.execute_reply.started": "2022-06-25T21:21:03.803906Z" }, "tags": [] }, "outputs": [], "source": [ "# root = etree.fromstringlist(countries[\"country_html\"].iloc[58], parser)\n", "root = etree.fromstring(countries[\"country_html\"].iloc[58], parser)" ] }, { "cell_type": "code", "execution_count": 69, "id": "70f25f0c-92bb-43ee-9223-15d1ad9fb691", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:22:53.884677Z", "iopub.status.busy": "2022-06-25T21:22:53.884343Z", "iopub.status.idle": "2022-06-25T21:22:53.893752Z", "shell.execute_reply": "2022-06-25T21:22:53.893051Z", "shell.execute_reply.started": "2022-06-25T21:22:53.884650Z" }, "tags": [] }, "outputs": [], "source": [ "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", " for capital in element:\n", " capital_filter = capital.xpath(\"ul//a\")\n", " branch_filter = capital.xpath(\"ul//li\")\n", "\n", " _capitals = []\n", " for item in capital_filter:\n", " if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n", " _capitals.append(item.text)\n", " for match in branch_filter:\n", " branch = match.xpath(\"text()\")\n", " _capitals.append(re.search(r\"(?:\\()([^\\)]*)\", branch[0])[1])\n", "\n", " result = {\n", " \"index\": 58,\n", " \"country_name\": countries[\"short_country_name\"].iloc[58],\n", " \"capital\": _capitals,\n", " }" ] }, { "cell_type": "code", "execution_count": 70, "id": "4d6fffaa-c08f-4ae9-b1d4-d1d27550a903", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:22:57.761492Z", "iopub.status.busy": "2022-06-25T21:22:57.761115Z", "iopub.status.idle": "2022-06-25T21:22:57.767381Z", "shell.execute_reply": "2022-06-25T21:22:57.766523Z", "shell.execute_reply.started": "2022-06-25T21:22:57.761462Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'index': 58,\n", " 'country_name': 'South Africa',\n", " 'capital': ['Pretoria',\n", " 'Cape Town',\n", " 'Bloemfontein',\n", " 'executive',\n", " 'legislative',\n", " 'judicial']}" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "id": "8af8a656-3e07-4de0-933e-012d218c18f9", "metadata": {}, "source": [ "#### Albania" ] }, { "cell_type": "code", "execution_count": 41, "id": "33205ffb-1b50-468b-8d5b-3c7302a8b285", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T20:48:07.893538Z", "iopub.status.busy": "2022-06-25T20:48:07.892909Z", "iopub.status.idle": "2022-06-25T20:48:07.905915Z", "shell.execute_reply": "2022-06-25T20:48:07.905138Z", "shell.execute_reply.started": "2022-06-25T20:48:07.893507Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "
" ], "text/plain": [ " country_url \\\n", "205 https://en.wikipedia.org/wiki/Albania \n", "\n", " flag_description_url short_country_name \\\n", "205 https://en.wikipedia.org/wiki/Flag_of_Albania Albania \n", "\n", " country_html \\\n", "205
The flag of Albania (\n" ] } ], "source": [ "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", " print(element)" ] }, { "cell_type": "markdown", "id": "5c316764-016f-49be-a3ad-1485dceb5b0c", "metadata": {}, "source": [ "#### Sahrawi Arab Democratic Republic" ] }, { "cell_type": "code", "execution_count": 80, "id": "3dfd7e29-6166-483a-b454-09daadf2ea20", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:26:21.907262Z", "iopub.status.busy": "2022-06-25T21:26:21.906430Z", "iopub.status.idle": "2022-06-25T21:26:21.924749Z", "shell.execute_reply": "2022-06-25T21:26:21.923775Z", "shell.execute_reply.started": "2022-06-25T21:26:21.907228Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
20https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...Sahrawi Arab Democratic Republic<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of Western Sahara</b> (Arabic: ...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
39https://en.wikipedia.org/wiki/United_Arab_Emir...https://en.wikipedia.org/wiki/Flag_of_the_Unit...United Arab Emirates<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
69https://en.wikipedia.org/wiki/Saudi_Arabiahttps://en.wikipedia.org/wiki/Flag_of_Saudi_Ar...Saudi Arabia<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of the Kingdom of Saudi Arabia<...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "
" ], "text/plain": [ " country_url \\\n", "20 https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem... \n", "39 https://en.wikipedia.org/wiki/United_Arab_Emir... \n", "69 https://en.wikipedia.org/wiki/Saudi_Arabia \n", "\n", " flag_description_url \\\n", "20 https://en.wikipedia.org/wiki/Flag_of_Sahrawi_... \n", "39 https://en.wikipedia.org/wiki/Flag_of_the_Unit... \n", "69 https://en.wikipedia.org/wiki/Flag_of_Saudi_Ar... \n", "\n", " short_country_name \\\n", "20 Sahrawi Arab Democratic Republic \n", "39 United Arab Emirates \n", "69 Saudi Arabia \n", "\n", " country_html \\\n", "20
The flag of Western Sahara (Arabic: ... \n", "39

The The flag of the Kingdom of Saudi Arabia<... \n", "\n", " file_urls \\\n", "20 [https:////upload.wikimedia.org/wikipedia/comm... \n", "39 [https:////upload.wikimedia.org/wikipedia/comm... \n", "69 [https:////upload.wikimedia.org/wikipedia/comm... \n", "\n", " files \n", "20 [{'url': 'https://upload.wikimedia.org/wikiped... \n", "39 [{'url': 'https://upload.wikimedia.org/wikiped... \n", "69 [{'url': 'https://upload.wikimedia.org/wikiped... " ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries[\n", " countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())\n", "]" ] }, { "cell_type": "code", "execution_count": 81, "id": "02e1f53a-9081-4edc-9a61-41da56166274", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:26:26.463033Z", "iopub.status.busy": "2022-06-25T21:26:26.462631Z", "iopub.status.idle": "2022-06-25T21:26:26.469880Z", "shell.execute_reply": "2022-06-25T21:26:26.469125Z", "shell.execute_reply.started": "2022-06-25T21:26:26.463004Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "country_url https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...\n", "flag_description_url https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...\n", "short_country_name Sahrawi Arab Democratic Republic\n", "country_html

The flag of Western Sahara (Arabic: ...\n", "file_urls [https:////upload.wikimedia.org/wikipedia/comm...\n", "files [{'url': 'https://upload.wikimedia.org/wikiped...\n", "Name: 20, dtype: object" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.iloc[20]" ] }, { "cell_type": "code", "execution_count": 82, "id": "3b74a848-af64-4ea8-b2b2-3b78c9fb843e", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:26:39.089757Z", "iopub.status.busy": "2022-06-25T21:26:39.089324Z", "iopub.status.idle": "2022-06-25T21:26:39.095857Z", "shell.execute_reply": "2022-06-25T21:26:39.094902Z", "shell.execute_reply.started": "2022-06-25T21:26:39.089723Z" }, "tags": [] }, "outputs": [], "source": [ "root = etree.fromstring(countries[\"country_html\"].iloc[20], parser)" ] }, { "cell_type": "code", "execution_count": 114, "id": "f8829405-0533-4349-9c99-05e9e1e41607", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:49:34.812182Z", "iopub.status.busy": "2022-06-25T21:49:34.811795Z", "iopub.status.idle": "2022-06-25T21:49:34.820550Z", "shell.execute_reply": "2022-06-25T21:49:34.819761Z", "shell.execute_reply.started": "2022-06-25T21:49:34.812153Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'
'\n", "b'de jure)'\n", "match de jure\n", "b'de facto)'\n", "match de facto\n" ] } ], "source": [ "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", " for capital in element:\n", " print(etree.tostring(capital))\n", " capital_filter = capital.xpath(\"ul//a\")\n", " branch_filter = capital.xpath(\"ul//li//i\")\n", "\n", " _capitals = []\n", " for item in capital_filter:\n", " if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n", " _capitals.append(item.text)\n", " for match in branch_filter:\n", " print(etree.tostring(match))\n", " print(\"match\", match.text)\n", " if isinstance(\n", " re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")),\n", " re.Match,\n", " ):\n", " _capitals.append(match.text)\n", " result = {\n", " \"index\": 20,\n", " \"country_name\": countries[\"short_country_name\"].iloc[20],\n", " \"capital\": _capitals,\n", " }" ] }, { "cell_type": "code", "execution_count": 115, "id": "92d7c4ac-44f9-46e6-b8ae-94dffa276f84", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T21:49:35.379470Z", "iopub.status.busy": "2022-06-25T21:49:35.378687Z", "iopub.status.idle": "2022-06-25T21:49:35.388295Z", "shell.execute_reply": "2022-06-25T21:49:35.386602Z", "shell.execute_reply.started": "2022-06-25T21:49:35.379399Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'index': 20,\n", " 'country_name': 'Sahrawi Arab Democratic Republic',\n", " 'capital': ['El Aaiún', 'Tifariti', 'de jure', 'de facto']}" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "id": "7dbe415a-ba82-4813-9723-ac66ec9b29aa", "metadata": {}, "source": [ "#### State of Palestine" ] }, { "cell_type": "code", "execution_count": 212, "id": "b39a6451-19e0-4ec6-a925-89bcdb89c441", "metadata": { "execution": { "iopub.execute_input": "2022-06-25T23:47:35.380713Z", "iopub.status.busy": "2022-06-25T23:47:35.380097Z", "iopub.status.idle": "2022-06-25T23:47:35.485628Z", "shell.execute_reply": "2022-06-25T23:47:35.484621Z", "shell.execute_reply.started": "2022-06-25T23:47:35.380654Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
87https://en.wikipedia.org/wiki/State_of_Palestinehttps://en.wikipedia.org/wiki/Flag_of_PalestineState of Palestine<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of Palestine</b> (<a href=\"/wik...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", "
" ], "text/plain": [ " country_url \\\n", "87 https://en.wikipedia.org/wiki/State_of_Palestine \n", "\n", " flag_description_url short_country_name \\\n", "87 https://en.wikipedia.org/wiki/Flag_of_Palestine State of Palestine \n", "\n", " country_html \\\n", "87
The flag of Palestine ()([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")\n", " ),\n", " re.Match,\n", " ):\n", " _capitals.append(match.text)\n", " result = {\n", " \"index\": index,\n", " \"country_name\": country_name,\n", " \"capital\": _capitals,\n", " }\n", "\n", " # proclaimed capitals: e.g Palestine\n", " if result is None:\n", " for element in root.xpath(\n", " \"//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]\"\n", " ):\n", " result = {\n", " \"index\": index,\n", " \"country_name\": country_name,\n", " \"capital\": element.text,\n", " }\n", "\n", " return result or None" ] }, { "cell_type": "code", "execution_count": 273, "id": "0b39d1c7-a1ee-4031-a0eb-93ab5ef525ff", "metadata": { "execution": { "iopub.execute_input": "2022-06-26T00:06:10.855929Z", "iopub.status.busy": "2022-06-26T00:06:10.855004Z", "iopub.status.idle": "2022-06-26T00:06:11.012760Z", "shell.execute_reply": "2022-06-26T00:06:11.011838Z", "shell.execute_reply.started": "2022-06-26T00:06:10.855859Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'index': 23, 'country_name': 'Kosovo', 'capital': []}\n", "33 Vatican City\n", "64 Singapore\n", "{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': []}\n" ] } ], "source": [ "for index, country_name, country_html in zip(\n", " countries.index, countries[\"short_country_name\"], countries[\"country_html\"]\n", "):\n", " result = extract_capital_0(index, country_name, country_html)\n", " try:\n", " if len(result[\"capital\"]) == 0:\n", " print(result)\n", " except TypeError:\n", " print(index, country_name)\n", "\n", " # print(json.dumps(extract_capital_0(index, country_name, country_html)))" ] }, { "cell_type": "code", "execution_count": null, "id": "8c147ae2-cc6f-4ac9-9c6a-81fb8e71d1d1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 }