# Capital extraction

In [1]:
import json
import pathlib
import re
import xml.etree.ElementTree as ET

import pandas as pd
from lxml import etree

## load data

### load the raw countries data

In [2]:
# The raw scrapy dump lives under <two levels above the working directory>/data/scrapy/.
_two_levels_up = pathlib.Path(".").resolve().parents[1]
data_directory = _two_levels_up.joinpath("data", "scrapy", "raw_country_data")

In [12]:
# Read the scraped country records from `countries.json` inside `data_directory`.
countries = pd.read_json(data_directory / "countries.json")
# Bare expression so the notebook renders the loaded frame.
countries

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
0,https://en.wikipedia.org/wiki/Afghanistan,https://en.wikipedia.org/wiki/Flag_of_Afghanistan,Afghanistan,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
1,https://en.wikipedia.org/wiki/Croatia,https://en.wikipedia.org/wiki/Flag_of_Croatia,Croatia,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
2,https://en.wikipedia.org/wiki/Costa_Rica,https://en.wikipedia.org/wiki/Flag_of_Costa_Rica,Costa_Rica,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <b><a href=""/wiki/National_flag"" title=...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
3,https://en.wikipedia.org/wiki/Democratic_Repub...,https://en.wikipedia.org/wiki/Flag_of_the_Demo...,Democratic_Republic_of_the_Congo,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
4,https://en.wikipedia.org/wiki/Comoros,https://en.wikipedia.org/wiki/Flag_of_Comoros,Comoros,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <b><a href=""/wiki/National_flag"" title=...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
...,...,...,...,...,...,...,...
201,https://en.wikipedia.org/wiki/Antigua_and_Barbuda,https://en.wikipedia.org/wiki/Flag_of_Antigua_...,Antigua_and_Barbuda,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
202,https://en.wikipedia.org/wiki/Angola,https://en.wikipedia.org/wiki/Flag_of_Angola,Angola,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
203,https://en.wikipedia.org/wiki/Andorra,https://en.wikipedia.org/wiki/Flag_of_Andorra,Andorra,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
204,https://en.wikipedia.org/wiki/Algeria,https://en.wikipedia.org/wiki/Flag_of_Algeria,Algeria,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


## cleaning

### clean `short_country_name`

In [13]:
# Names such as `Costa_Rica` use underscores as word separators; swap them for
# spaces with the vectorised string accessor (literal match, not a regex).
countries["short_country_name"] = countries["short_country_name"].str.replace(
    "_", " ", regex=False
)

In [14]:
# Render the frame to check the cleaned `short_country_name` column.
countries

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
0,https://en.wikipedia.org/wiki/Afghanistan,https://en.wikipedia.org/wiki/Flag_of_Afghanistan,Afghanistan,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
1,https://en.wikipedia.org/wiki/Croatia,https://en.wikipedia.org/wiki/Flag_of_Croatia,Croatia,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
2,https://en.wikipedia.org/wiki/Costa_Rica,https://en.wikipedia.org/wiki/Flag_of_Costa_Rica,Costa Rica,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <b><a href=""/wiki/National_flag"" title=...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
3,https://en.wikipedia.org/wiki/Democratic_Repub...,https://en.wikipedia.org/wiki/Flag_of_the_Demo...,Democratic Republic of the Congo,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
4,https://en.wikipedia.org/wiki/Comoros,https://en.wikipedia.org/wiki/Flag_of_Comoros,Comoros,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <b><a href=""/wiki/National_flag"" title=...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
...,...,...,...,...,...,...,...
201,https://en.wikipedia.org/wiki/Antigua_and_Barbuda,https://en.wikipedia.org/wiki/Flag_of_Antigua_...,Antigua and Barbuda,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
202,https://en.wikipedia.org/wiki/Angola,https://en.wikipedia.org/wiki/Flag_of_Angola,Angola,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
203,https://en.wikipedia.org/wiki/Andorra,https://en.wikipedia.org/wiki/Flag_of_Andorra,Andorra,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
204,https://en.wikipedia.org/wiki/Algeria,https://en.wikipedia.org/wiki/Flag_of_Algeria,Algeria,"[<tr><th colspan=""2"" class=""infobox-above adr""...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


### convert `country_html` to single string

In [15]:
# Every `country_html` entry is a sequence of HTML fragments; concatenate each
# one into a single string.
joined_html = countries["country_html"].map("".join)
countries["country_html"] = joined_html
countries

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
0,https://en.wikipedia.org/wiki/Afghanistan,https://en.wikipedia.org/wiki/Flag_of_Afghanistan,Afghanistan,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
1,https://en.wikipedia.org/wiki/Croatia,https://en.wikipedia.org/wiki/Flag_of_Croatia,Croatia,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
2,https://en.wikipedia.org/wiki/Costa_Rica,https://en.wikipedia.org/wiki/Flag_of_Costa_Rica,Costa Rica,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <b><a href=""/wiki/National_flag"" title=...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
3,https://en.wikipedia.org/wiki/Democratic_Repub...,https://en.wikipedia.org/wiki/Flag_of_the_Demo...,Democratic Republic of the Congo,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
4,https://en.wikipedia.org/wiki/Comoros,https://en.wikipedia.org/wiki/Flag_of_Comoros,Comoros,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <b><a href=""/wiki/National_flag"" title=...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
...,...,...,...,...,...,...,...
201,https://en.wikipedia.org/wiki/Antigua_and_Barbuda,https://en.wikipedia.org/wiki/Flag_of_Antigua_...,Antigua and Barbuda,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
202,https://en.wikipedia.org/wiki/Angola,https://en.wikipedia.org/wiki/Flag_of_Angola,Angola,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
203,https://en.wikipedia.org/wiki/Andorra,https://en.wikipedia.org/wiki/Flag_of_Andorra,Andorra,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
204,https://en.wikipedia.org/wiki/Algeria,https://en.wikipedia.org/wiki/Flag_of_Algeria,Algeria,"<tr><th colspan=""2"" class=""infobox-above adr"">...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


### remove `<br>` tags

In [25]:
# Drop literal `<br>` tags (an unclosed `<br>` is not well-formed XML) using the
# vectorised string accessor; `regex=False` keeps the match literal.
countries["country_html"] = countries["country_html"].str.replace(
    "<br>", "", regex=False
)

### add root node

In [29]:
def _wrap_in_div(html):
    """Surround an HTML fragment with a single `<div>` root element."""
    return f"<div>{html}</div>"


# The row fragments have no common parent, so give every document one root node.
countries["country_html"] = countries["country_html"].map(_wrap_in_div)

## parse

### parse string as xml

In [32]:
# Shared lxml XML parser created with `recover=True`, i.e. it tries to keep
# parsing broken markup instead of raising.
# NOTE(review): the input is scraped HTML; presumably `etree.HTMLParser` would
# also work here - confirm before switching.
parser = etree.XMLParser(recover=True)

#### Afghanistan

In [33]:
# Parse the first row's HTML (Afghanistan in the displayed frame) into an element tree.
# `fromstring` is used rather than `fromstringlist` because `country_html`
# holds one joined string per row at this point.
root = etree.fromstring(countries["country_html"].iloc[0], parser)

In [34]:
# Show the parsed root element to confirm the parse produced a tree.
root

<Element div at 0x123031f80>

In [35]:
# Anchors that are direct children of the cell next to the "Capital" header.
capital_links = root.xpath("//th[text() = 'Capital']/following-sibling::td/a")
for link in capital_links:
    # Serialised node, its text, and the element repr - one per line.
    print(etree.tostring(link), link.text, link, sep="\n")

b'<a href="/wiki/Kabul" title="Kabul">Kabul</a>'
Kabul
<Element a at 0x122f81100>


#### South Africa

In [36]:
# Rows whose short name contains "Africa" (case-sensitive substring match).
africa_mask = countries["short_country_name"].map(lambda name: "Africa" in name)
countries[africa_mask]

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
9,https://en.wikipedia.org/wiki/Central_African_...,https://en.wikipedia.org/wiki/Flag_of_the_Cent...,Central African Republic,"<div><tr><th colspan=""2"" class=""infobox-above ...","<p>The <b>flag of the <a href=""/wiki/Central_A...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
58,https://en.wikipedia.org/wiki/South_Africa,https://en.wikipedia.org/wiki/Flag_of_South_Af...,South Africa,"<div><tr><th colspan=""2"" class=""infobox-above ...",<p>The <b>flag of South Africa</b> was designe...,[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


In [60]:
# Parse the row at position 58 (South Africa has index label 58 in the filtered frame).
# NOTE(review): `.iloc` is positional - this assumes the frame still has its
# default 0..n-1 index so that label and position coincide.
root = etree.fromstring(countries["country_html"].iloc[58], parser)

In [69]:
for capital_cell in root.xpath("//th[text() = 'Capital']/following-sibling::td"):
    for child in capital_cell:
        # The list is rebuilt for every child, so only the last child's
        # findings survive the inner loop.
        _capitals = [
            anchor.text
            for anchor in child.xpath("ul//a")
            # skip anchors whose text contains a digit
            if re.search(r"\d", anchor.text) is None
        ]
        for list_item in child.xpath("ul//li"):
            # First text node of the <li>; keep what follows the "(".
            leading_text = list_item.xpath("text()")[0]
            _capitals.append(re.search(r"(?:\()([^\)]*)", leading_text)[1])

    country_name = countries["short_country_name"].iloc[58]
    result = {
        "index": 58,
        "country_name": country_name,
        "capital": _capitals,
    }

In [70]:
# Inspect the record built for South Africa (position 58).
result

{'index': 58,
 'country_name': 'South Africa',
 'capital': ['Pretoria',
  'Cape Town',
  'Bloemfontein',
  'executive',
  'legislative',
  'judicial']}

#### Albania

In [41]:
# Rows whose short name contains "Albania" (case-sensitive substring match).
is_albania = ["Albania" in name for name in countries["short_country_name"]]
countries.loc[is_albania]

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
205,https://en.wikipedia.org/wiki/Albania,https://en.wikipedia.org/wiki/Flag_of_Albania,Albania,"<div><tr><th colspan=""2"" class=""infobox-above ...","<p>The <b>flag of Albania</b> (<a href=""/wiki/...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


In [43]:
# Parse the row at position 205 (Albania has index label 205 in the filtered frame).
# NOTE(review): `.iloc` is positional - assumes label and position coincide.
root = etree.fromstring(countries["country_html"].iloc[205], parser)

In [45]:
# Cells that follow the plain "Capital" header in the parsed document.
capital_cells = root.xpath("//th[text() = 'Capital']/following-sibling::td")
for cell in capital_cells:
    print(cell)

<Element td at 0x123fc9840>


#### Sahrawi Arab Democratic Republic

In [80]:
# Rows whose short name contains "arab", ignoring case.
mentions_arab = [
    "arab" in name.lower() for name in countries["short_country_name"]
]
countries[mentions_arab]

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
20,https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...,https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...,Sahrawi Arab Democratic Republic,"<div><tr><th colspan=""2"" class=""infobox-above ...",<p>The <b>flag of Western Sahara</b> (Arabic: ...,[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
39,https://en.wikipedia.org/wiki/United_Arab_Emir...,https://en.wikipedia.org/wiki/Flag_of_the_Unit...,United Arab Emirates,"<div><tr><th colspan=""2"" class=""infobox-above ...","<p>The <a href=""/wiki/National_flag"" title=""Na...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...
69,https://en.wikipedia.org/wiki/Saudi_Arabia,https://en.wikipedia.org/wiki/Flag_of_Saudi_Ar...,Saudi Arabia,"<div><tr><th colspan=""2"" class=""infobox-above ...",<p>The <b>flag of the Kingdom of Saudi Arabia<...,[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


In [81]:
# Show the full record at position 20 (Sahrawi Arab Democratic Republic in the output).
countries.iloc[20]

country_url             https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...
flag_description_url    https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...
short_country_name                       Sahrawi Arab Democratic Republic
country_html            <div><tr><th colspan="2" class="infobox-above ...
flag_html               <p>The <b>flag of Western Sahara</b> (Arabic: ...
file_urls               [https:////upload.wikimedia.org/wikipedia/comm...
files                   [{'url': 'https://upload.wikimedia.org/wikiped...
Name: 20, dtype: object

In [82]:
# Parse the row at position 20 (Sahrawi Arab Democratic Republic) for inspection.
root = etree.fromstring(countries["country_html"].iloc[20], parser)

In [114]:
for capital_cell in root.xpath("//th[text() = 'Capital']/following-sibling::td"):
    for child in capital_cell:
        print(etree.tostring(child))
        anchors = child.xpath("ul//a")
        italics = child.xpath("ul//li//i")

        # Rebuilt for every child: only the last child's findings are kept.
        _capitals = [
            anchor.text
            for anchor in anchors
            # skip anchors whose text contains a digit
            if re.search(r"\d", anchor.text) is None
        ]
        for italic in italics:
            serialised = etree.tostring(italic)
            print(serialised)
            print("match", italic.text)
            # Keep the italic text when the serialised node contains "i>".
            if re.search(r"(?:i\>)([^\<)]*)", serialised.decode("utf-8")) is not None:
                _capitals.append(italic.text)
    country_name = countries["short_country_name"].iloc[20]
    result = {
        "index": 20,
        "country_name": country_name,
        "capital": _capitals,
    }

b'<div class="plainlist"><ul><li><a href="/wiki/Laayoune" title="Laayoune">El Aai&#250;n</a><sup>a</sup> (<i>de jure</i>)</li><li><span class="nowrap"><a href="/wiki/Tifariti" title="Tifariti">Tifariti</a> (<i>de facto</i>)</span></li></ul></div>'
b'<i>de jure</i>)'
match de jure
b'<i>de facto</i>)'
match de facto


In [115]:
# Inspect the record built for the Sahrawi Arab Democratic Republic (position 20).
result

{'index': 20,
 'country_name': 'Sahrawi Arab Democratic Republic',
 'capital': ['El Aaiún', 'Tifariti', 'de jure', 'de facto']}

#### State of Palestine

In [212]:
# Rows whose short name contains "palestine", ignoring case.
palestine_mask = countries["short_country_name"].map(
    lambda name: "palestine" in name.lower()
)
countries.loc[palestine_mask]

Unnamed: 0,country_url,flag_description_url,short_country_name,country_html,flag_html,file_urls,files
87,https://en.wikipedia.org/wiki/State_of_Palestine,https://en.wikipedia.org/wiki/Flag_of_Palestine,State of Palestine,"<div><tr><th colspan=""2"" class=""infobox-above ...","<p>The <b>flag of Palestine</b> (<a href=""/wik...",[https:////upload.wikimedia.org/wikipedia/comm...,[{'url': 'https://upload.wikimedia.org/wikiped...


In [213]:
# Parse the row at position 87 (State of Palestine has index label 87 in the filtered frame).
# NOTE(review): `.iloc` is positional - assumes label and position coincide.
root = etree.fromstring(countries["country_html"].iloc[87], parser)

In [262]:
# First link of the first list item in the first cell that follows the
# "Proclaimed capital" header entry.
proclaimed_capital_xpath = (
    "//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]"
)
for link in root.xpath(proclaimed_capital_xpath):
    print(link.text)

Jerusalem


## define functions

In [271]:
# XPath of the infobox cell(s) that follow a header whose text is exactly "Capital".
_CAPITAL_CELL_XPATH = "//th[text() = 'Capital']/following-sibling::td"


def _capitals_from_div_links(cell):
    """Collect link texts and parenthesised notes from the ``div`` blocks of ``cell``."""
    names = [link.text for link in cell.xpath("div//a") if link.text is not None]
    for text_node in cell.xpath("div//ul/li/text()"):
        # Keep whatever follows an opening parenthesis, up to "/" or ")".
        note = re.search(r"(?:\()([^/)]*)", str(text_node).strip())
        if note is not None:
            names.append(note[1])
    return names


def _capitals_from_italic_lists(cell):
    """Collect link texts and italic annotations from lists under ``cell``'s children."""
    names = []
    for child in cell:
        for link in child.xpath("ul//a"):
            # Links without text, or whose text contains a digit, are skipped.
            if link.text is not None and re.search(r"\d", link.text) is None:
                names.append(link.text)
        for italic in child.xpath("ul//li//i"):
            if italic.text is not None:
                names.append(italic.text)
    return names


def extract_capital_0(index: int, country_name: str, country_html: str):
    """Extract the capital(s) of one country from its infobox markup.

    The markup is parsed with the notebook-level ``parser`` and the following
    layouts are tried in order; the first one that yields something wins:

    1. a link directly inside the cell next to the "Capital" header
       (``capital`` is a single string, taken from the last such link);
    2. links / parenthesised notes inside ``div`` blocks of that cell
       (``capital`` is a list);
    3. links / italic annotations inside lists under the cell's children
       (``capital`` is a list);
    4. the first link under a "Proclaimed capital" header entry
       (``capital`` is a single string).

    Args:
        index: Identifier copied into the result under ``"index"``.
        country_name: Name copied into the result under ``"country_name"``.
        country_html: Infobox markup with a single root element.

    Returns:
        A dict with the keys ``"index"``, ``"country_name"`` and ``"capital"``.
        If a "Capital" cell exists but nothing could be extracted from it,
        ``"capital"`` is an empty list. ``None`` is returned when the markup
        cannot be parsed or contains neither a "Capital" cell nor a
        "Proclaimed capital" entry.
    """

    def as_record(capital):
        return {"index": index, "country_name": country_name, "capital": capital}

    document = etree.fromstring(country_html, parser)
    if document is None:
        # A recovering parser may give up entirely on unusable input.
        return None

    # 1) single capital: the last direct link wins
    direct_links = document.xpath(_CAPITAL_CELL_XPATH + "/a")
    if direct_links:
        return as_record(direct_links[-1].text)

    capital_cells = document.xpath(_CAPITAL_CELL_XPATH)

    # 2) multiple capitals inside div blocks; when several cells match, the
    #    last cell decides
    capitals = _capitals_from_div_links(capital_cells[-1]) if capital_cells else []

    # 3) multiple capitals with italic footnotes - only consulted when the
    #    previous layout produced nothing
    if not capitals:
        for cell in capital_cells:
            capitals.extend(_capitals_from_italic_lists(cell))

    if capitals:
        return as_record(capitals)

    # 4) proclaimed capitals, e.g. Palestine
    proclaimed_links = document.xpath(
        "//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]"
    )
    if proclaimed_links:
        return as_record(proclaimed_links[-1].text)

    # A "Capital" cell without extractable content still yields a record with
    # an empty list, so callers can tell it apart from "no capital row at all".
    return as_record([]) if capital_cells else None

In [273]:
# Report every country for which no capital could be extracted.
for index, country_name, country_html in zip(
    countries.index, countries["short_country_name"], countries["country_html"]
):
    result = extract_capital_0(index, country_name, country_html)
    if result is None or result["capital"] is None:
        # Nothing usable was found for this country.
        print(index, country_name)
    elif len(result["capital"]) == 0:
        # A record was built but its capital entry is empty.
        print(result)

{'index': 23, 'country_name': 'Kosovo', 'capital': []}
33 Vatican City
64 Singapore
{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': []}
