From 101f4a4080e74e99f2e34800bd9a197e32d9426e Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sat, 25 Jun 2022 23:22:59 +0100 Subject: [PATCH] chore: update playground --- .../spiders/countrydownloader_old_anthem.py | 122 - docs/deck_readme.md | 12 + .../Untitled.ipynb | 499 ---- .../capital_xpath.ipynb | 2149 +++++++++++++++++ .../exploration.ipynb | 599 +++++ 5 files changed, 2760 insertions(+), 621 deletions(-) delete mode 100644 01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py create mode 100644 docs/deck_readme.md delete mode 100644 playground/downloaded_data_inspection_lab/Untitled.ipynb create mode 100644 playground/downloaded_data_inspection_lab/capital_xpath.ipynb create mode 100644 playground/downloaded_data_inspection_lab/exploration.ipynb diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py deleted file mode 100644 index 23acf29..0000000 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py +++ /dev/null @@ -1,122 +0,0 @@ -# from __future__ import annotations - -# import re - -# import scrapy -# from scrapy.http import TextResponse - -# from wikipedia_country_scraper.items import WikipediaCountryScraperItem - - -# class CountrydownloaderSpider(scrapy.Spider): -# name = "CountrydownloaderSpider" - -# def start_requests(self): -# return [ -# scrapy.Request( -# url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls -# ) -# ] - -# def extract_country_urls(self, response: TextResponse): -# country_urls_xpath = response.xpath( -# "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href" -# ).getall() - -# for url in 
country_urls_xpath: -# # for url in country_urls_xpath[:3]: -# regex_match = re.search(r"\/wiki\/(?P[^$]*)", url) -# yield scrapy.Request( -# url=f"https://en.wikipedia.org{url}", -# callback=self.extract_country_information, -# cb_kwargs={ -# "country_item": { -# "country_url": f"https://en.wikipedia.org{url}", -# "short_country_name": regex_match["short_country_name"] -# if isinstance(regex_match, re.Match) -# else None, -# } -# }, -# ) - -# def extract_country_information(self, response: TextResponse, country_item: dict): -# country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall() - -# flag_image_url = response.xpath( -# "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" -# ).get() -# flag_description_url = response.xpath( -# "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href" -# ).get() - -# anthem_page_url = response.xpath( -# "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href" -# ).get() - -# country_item = { -# **country_item, -# "country": country_information_xpath, -# } - -# yield scrapy.Request( -# url=f"https://en.wikipedia.org{flag_description_url}", -# callback=self.extract_flag_description, -# cb_kwargs={ -# "country_item": country_item, -# "urls": { -# "flag_image_url": f"https://en.wikipedia.org{flag_image_url}", -# "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}", -# }, -# }, -# ) - -# def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict): -# flag_description_xpath = response.xpath( -# "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]" -# ).get() -# country_item = {**country_item, "flag_description": flag_description_xpath} - -# yield scrapy.Request( -# url=urls["flag_image_url"], -# callback=self.extract_flag_images, -# cb_kwargs={ -# "country_item": country_item, -# "urls": urls, -# }, -# ) - 
-# def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict): -# flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get() -# country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"} - -# yield scrapy.Request( -# url=urls["anthem_page_url"], -# callback=self.extract_anthem_file, -# cb_kwargs={ -# "country_item": country_item, -# "urls": urls, -# }, -# ) - -# def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict): -# anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get() -# _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall() - -# anthem_file_url = next( -# (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None -# ) - -# country_scrapy_item = WikipediaCountryScraperItem() -# country_scrapy_item["country_url"] = country_item["country_url"] -# country_scrapy_item["short_country_name"] = country_item["short_country_name"] -# country_scrapy_item["country"] = country_item["country"] -# country_scrapy_item["flag_description"] = country_item["flag_description"] -# country_scrapy_item["anthem"] = anthem_text -# country_scrapy_item["anthem_url"] = urls["anthem_page_url"] -# country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}" -# country_scrapy_item["file_urls"] = [ -# country_item["flag_image_url"], -# f"https://en.wikipedia.org{anthem_file_url}", -# ] - -# yield country_scrapy_item diff --git a/docs/deck_readme.md b/docs/deck_readme.md new file mode 100644 index 0000000..b719fa8 --- /dev/null +++ b/docs/deck_readme.md @@ -0,0 +1,12 @@ + +Although there are many really good geography focused decks already, some are out of date, or contain way more information than needed to learn the topic at hand. + +For a deck to learn Capital cities/flags, having extra information like the population, language etc. is nice to have. 
But I personally find it distracting to have this information on a card. When learning Capital cities I just want the country and capital - nothing more. + +The idea behind these decks: + +- Should be up to date and current. +- Should use an authoritative source (e.g. Wikipedia/Google Maps). +- Should be easy to update. +- Should be automatically generated - no user tweaking required to get the core deck. +- Should be open source - the code is freely available with detailed instructions in the project repository. diff --git a/playground/downloaded_data_inspection_lab/Untitled.ipynb b/playground/downloaded_data_inspection_lab/Untitled.ipynb deleted file mode 100644 index 896acc4..0000000 --- a/playground/downloaded_data_inspection_lab/Untitled.ipynb +++ /dev/null @@ -1,499 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "d8185790-0793-4881-99e8-6730f95a8006", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:54.386982Z", - "iopub.status.busy": "2022-06-24T22:04:54.386313Z", - "iopub.status.idle": "2022-06-24T22:04:54.854521Z", - "shell.execute_reply": "2022-06-24T22:04:54.853581Z", - "shell.execute_reply.started": "2022-06-24T22:04:54.386910Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import json\n", - "import pathlib\n", - "\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:55.458615Z", - "iopub.status.busy": "2022-06-24T22:04:55.457695Z", - "iopub.status.idle": "2022-06-24T22:04:55.475878Z", - "shell.execute_reply": "2022-06-24T22:04:55.474706Z", - "shell.execute_reply.started": "2022-06-24T22:04:55.458548Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[None]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd_options = {\n", - " \"display.max_rows\": None,\n", - 
"}\n", - "\n", - "[pd.set_option(option, value) for option, value in pd_options.items()]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "36149580-91d9-431d-99c3-51feee829e79", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:56.134416Z", - "iopub.status.busy": "2022-06-24T22:04:56.133745Z", - "iopub.status.idle": "2022-06-24T22:04:56.140326Z", - "shell.execute_reply": "2022-06-24T22:04:56.138507Z", - "shell.execute_reply.started": "2022-06-24T22:04:56.134371Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "data_directory = (\n", - " pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d03be94e-8642-4916-8a43-1711e0c21b36", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:56.621163Z", - "iopub.status.busy": "2022-06-24T22:04:56.620692Z", - "iopub.status.idle": "2022-06-24T22:04:56.731001Z", - "shell.execute_reply": "2022-06-24T22:04:56.728392Z", - "shell.execute_reply.started": "2022-06-24T22:04:56.621128Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "countries_file = data_directory / \"countries.json\"\n", - "countries = json.loads(countries_file.read_text())\n", - "# countries" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:57.257218Z", - "iopub.status.busy": "2022-06-24T22:04:57.256573Z", - "iopub.status.idle": "2022-06-24T22:04:57.333032Z", - "shell.execute_reply": "2022-06-24T22:04:57.332120Z", - "shell.execute_reply.started": "2022-06-24T22:04:57.257174Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['country_url', 'flag_description_url', 'short_country_name',\n", - " 'country_html', 'flag_html', 'file_urls', 'files'],\n", - " dtype='object')" - ] - }, - "execution_count": 5, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_json(countries_file)\n", - "df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:59.223608Z", - "iopub.status.busy": "2022-06-24T22:04:59.222961Z", - "iopub.status.idle": "2022-06-24T22:04:59.229384Z", - "shell.execute_reply": "2022-06-24T22:04:59.228618Z", - "shell.execute_reply.started": "2022-06-24T22:04:59.223578Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(206,)\n", - "[False]\n", - "[False]\n" - ] - } - ], - "source": [ - "country_url = df[\"country_url\"]\n", - "print(country_url.shape)\n", - "print(country_url.isnull().unique())\n", - "print(country_url.isna().unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "48db8f93-659b-45a4-8477-a7cec139bebc", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:59.710467Z", - "iopub.status.busy": "2022-06-24T22:04:59.709874Z", - "iopub.status.idle": "2022-06-24T22:04:59.720517Z", - "shell.execute_reply": "2022-06-24T22:04:59.717623Z", - "shell.execute_reply.started": "2022-06-24T22:04:59.710431Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(206,)\n", - "[False]\n", - "[False]\n" - ] - } - ], - "source": [ - "short_country_name = df[\"short_country_name\"]\n", - "print(short_country_name.shape)\n", - "print(short_country_name.isnull().unique())\n", - "print(short_country_name.isna().unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:04:59.950051Z", - "iopub.status.busy": "2022-06-24T22:04:59.949622Z", - "iopub.status.idle": "2022-06-24T22:04:59.956484Z", - "shell.execute_reply": 
"2022-06-24T22:04:59.955471Z", - "shell.execute_reply.started": "2022-06-24T22:04:59.950016Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(206,)\n", - "[False]\n", - "[False]\n" - ] - } - ], - "source": [ - "flag_html = df[\"flag_html\"]\n", - "print(flag_html.shape)\n", - "print(flag_html.isnull().unique())\n", - "print(flag_html.isna().unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:05:00.166633Z", - "iopub.status.busy": "2022-06-24T22:05:00.166278Z", - "iopub.status.idle": "2022-06-24T22:05:00.178277Z", - "shell.execute_reply": "2022-06-24T22:05:00.177378Z", - "shell.execute_reply.started": "2022-06-24T22:05:00.166609Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n", - "Index: []" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"flag_html\"].isnull()]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:05:00.714817Z", - "iopub.status.busy": "2022-06-24T22:05:00.714232Z", - "iopub.status.idle": "2022-06-24T22:05:00.728680Z", - "shell.execute_reply": "2022-06-24T22:05:00.727307Z", - "shell.execute_reply.started": "2022-06-24T22:05:00.714774Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n", - "Index: []" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"flag_html\"].isna()]" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "227b0c76-9e45-4849-849e-36355976cba9", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-24T22:13:34.716780Z", - "iopub.status.busy": "2022-06-24T22:13:34.716226Z", - "iopub.status.idle": "2022-06-24T22:13:34.734266Z", - "shell.execute_reply": "2022-06-24T22:13:34.733297Z", - "shell.execute_reply.started": "2022-06-24T22:13:34.716742Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "country_url https://en.wikipedia.org/wiki/Paraguay\n", - "flag_description_url https://en.wikipedia.org/wiki/Flag_of_Paraguay\n", - "short_country_name Paraguay\n", - "country_html [The flag of
Islamic Emirate of Afghanistan
\\n
\\n
\"Flag
\\n \\n
\\n
\\n
\"Coat
\\n \\n
\\n
Anthem:\\xa0دا د باتورانو کور
Dā Də Bātorāno Kor
\"This is the Home of the Brave\"[2]
\"AfghanistanAfghanistan on the globe
\"AfghanistanMap of Afghanistan
StatusUN member state under an unrecognized government[3]Capital
and largest city
Kabul
34°31′N 69°11′E\\ufeff / \\ufeff34.517°N 69.183°E\\ufeff / 34.517; 69.183Coordinates: 34°31′N 69°11′E\\ufeff / \\ufeff34.517°N 69.183°E\\ufeff / 34.517; 69.183[4]Major languagesEthnic\\xa0groups
(2019 unofficial estimates)[a][6][7][8][9]
Religion
Demonym(s)Afghan[b][13][14]GovernmentUnitary provisional theocratic Islamic emirate[15][16][17]
•\\xa0Leader
Hibatullah Akhundzada
•\\xa0Prime Minister
Hasan Akhund (acting)
•\\xa0Chief Justice
Abdul Hakim Ishaqzai\\nLegislatureLeadership Council (consultative body)[18]Formation
•\\xa0Hotak Empire
17091738
•\\xa0Durrani Empire
1747–1823
•\\xa0Emirate
1823–1839
•\\xa0Restoration of the Durrani Kingdom
1839–1842
•\\xa0Restoration of the Emirate
1842–1926
•\\xa0Dost Mohammad unites Afghanistan
27 May 1863
•\\xa0Anglo-Afghan Agreement
26 May 1879
•\\xa0Independence
19 August 1919
•\\xa0Kingdom
9 June 1926
•\\xa0Republic
17 July 1973
•\\xa0Democratic Republic
27–28 April 1978
•\\xa0Islamic State
28 April 1992
•\\xa0Islamic Emirate
27 September 1996
•\\xa0Islamic Republic
26 January 2004
•\\xa0Restoration of Islamic Emirate
15 August 2021\\nArea
•\\xa0Total
652,867[19]\\xa0km2 (252,073\\xa0sq\\xa0mi) (40th)
•\\xa0Water\\xa0(%)
negligiblePopulation
•\\xa02021 estimate
40,218,234[7] (37th)
•\\xa0Density
48.08/km2 (124.5/sq\\xa0mi) (174th)GDP\\xa0(PPP)2018\\xa0estimate
•\\xa0Total
$72.911\\xa0billion[20] (96th)
•\\xa0Per capita
$2,024[20] (169th)GDP\\xa0(nominal)2018\\xa0estimate
•\\xa0Total
$21.657\\xa0billion[20] (111st)
•\\xa0Per capita
$493[20] (177th)HDI\\xa0(2019)\"Increase\"\\xa00.511[21]
low\\xa0·\\xa0169thCurrencyAfghani (افغانی) (AFN)Time zoneUTC+4:30
Solar Calendar
(D†)Driving siderightCalling code+93ISO 3166 codeAFInternet TLD.af
افغانستان.'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(df.iloc[0].country_html)\n", - "content = \"\".join(df.iloc[0].country_html)\n", - "content" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97c1e41f-30f3-4116-aa11-5797e05b95ba", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - }, - "toc-autonumbering": true, - "toc-showcode": false - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/playground/downloaded_data_inspection_lab/capital_xpath.ipynb b/playground/downloaded_data_inspection_lab/capital_xpath.ipynb new file mode 100644 index 0000000..4bed7e5 --- /dev/null +++ b/playground/downloaded_data_inspection_lab/capital_xpath.ipynb @@ -0,0 +1,2149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1e7252d-16d9-4b68-a855-d94a89132291", + "metadata": {}, + "source": [ + "# Capital extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e4bb164b-f8a7-4d21-86a0-618ca78e9386", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:44:25.083012Z", + "iopub.status.busy": "2022-06-25T20:44:25.082584Z", + "iopub.status.idle": "2022-06-25T20:44:25.400728Z", + "shell.execute_reply": "2022-06-25T20:44:25.399967Z", + "shell.execute_reply.started": "2022-06-25T20:44:25.082926Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "import pathlib\n", + "import re\n", + "import xml.etree.ElementTree as ET\n", + "\n", + "import pandas as pd\n", + "from lxml import etree" + ] + }, + { + "cell_type": "markdown", + "id": 
"9fd69c5d-2c4d-49c8-a042-82eade1d6ab7", + "metadata": {}, + "source": [ + "## load data\n", + "\n", + "### load the raw countries data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bfe405bb-7879-4f5f-85df-7215c5e8a4b8", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:44:25.402295Z", + "iopub.status.busy": "2022-06-25T20:44:25.401710Z", + "iopub.status.idle": "2022-06-25T20:44:25.406212Z", + "shell.execute_reply": "2022-06-25T20:44:25.405622Z", + "shell.execute_reply.started": "2022-06-25T20:44:25.402274Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data_directory = (\n", + " pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "49fdb7c8-b9db-4d0a-8d2b-d4a0a3ddcdd9", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:45:11.573098Z", + "iopub.status.busy": "2022-06-25T20:45:11.572720Z", + "iopub.status.idle": "2022-06-25T20:45:11.650803Z", + "shell.execute_reply": "2022-06-25T20:45:11.650139Z", + "shell.execute_reply.started": "2022-06-25T20:45:11.573067Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
0https://en.wikipedia.org/wiki/Afghanistanhttps://en.wikipedia.org/wiki/Flag_of_AfghanistanAfghanistan[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
1https://en.wikipedia.org/wiki/Croatiahttps://en.wikipedia.org/wiki/Flag_of_CroatiaCroatia[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
2https://en.wikipedia.org/wiki/Costa_Ricahttps://en.wikipedia.org/wiki/Flag_of_Costa_RicaCosta_Rica[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
3https://en.wikipedia.org/wiki/Democratic_Repub...https://en.wikipedia.org/wiki/Flag_of_the_Demo...Democratic_Republic_of_the_Congo[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
4https://en.wikipedia.org/wiki/Comoroshttps://en.wikipedia.org/wiki/Flag_of_ComorosComoros[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
........................
201https://en.wikipedia.org/wiki/Antigua_and_Barbudahttps://en.wikipedia.org/wiki/Flag_of_Antigua_...Antigua_and_Barbuda[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
202https://en.wikipedia.org/wiki/Angolahttps://en.wikipedia.org/wiki/Flag_of_AngolaAngola[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
203https://en.wikipedia.org/wiki/Andorrahttps://en.wikipedia.org/wiki/Flag_of_AndorraAndorra[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
204https://en.wikipedia.org/wiki/Algeriahttps://en.wikipedia.org/wiki/Flag_of_AlgeriaAlgeria[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "

206 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " country_url \\\n", + "0 https://en.wikipedia.org/wiki/Afghanistan \n", + "1 https://en.wikipedia.org/wiki/Croatia \n", + "2 https://en.wikipedia.org/wiki/Costa_Rica \n", + "3 https://en.wikipedia.org/wiki/Democratic_Repub... \n", + "4 https://en.wikipedia.org/wiki/Comoros \n", + ".. ... \n", + "201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n", + "202 https://en.wikipedia.org/wiki/Angola \n", + "203 https://en.wikipedia.org/wiki/Andorra \n", + "204 https://en.wikipedia.org/wiki/Algeria \n", + "205 https://en.wikipedia.org/wiki/Albania \n", + "\n", + " flag_description_url \\\n", + "0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n", + "1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n", + "2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n", + "3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n", + "4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n", + ".. ... \n", + "201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n", + "202 https://en.wikipedia.org/wiki/Flag_of_Angola \n", + "203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n", + "204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n", + "205 https://en.wikipedia.org/wiki/Flag_of_Albania \n", + "\n", + " short_country_name \\\n", + "0 Afghanistan \n", + "1 Croatia \n", + "2 Costa_Rica \n", + "3 Democratic_Republic_of_the_Congo \n", + "4 Comoros \n", + ".. ... 
\n", + "201 Antigua_and_Barbuda \n", + "202 Angola \n", + "203 Andorra \n", + "204 Algeria \n", + "205 Albania \n", + "\n", + " country_html \\\n", + "0 [The The The The The The The The The The flag of Albania (\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
0https://en.wikipedia.org/wiki/Afghanistanhttps://en.wikipedia.org/wiki/Flag_of_AfghanistanAfghanistan[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
1https://en.wikipedia.org/wiki/Croatiahttps://en.wikipedia.org/wiki/Flag_of_CroatiaCroatia[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
2https://en.wikipedia.org/wiki/Costa_Ricahttps://en.wikipedia.org/wiki/Flag_of_Costa_RicaCosta Rica[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
3https://en.wikipedia.org/wiki/Democratic_Repub...https://en.wikipedia.org/wiki/Flag_of_the_Demo...Democratic Republic of the Congo[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
4https://en.wikipedia.org/wiki/Comoroshttps://en.wikipedia.org/wiki/Flag_of_ComorosComoros[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
........................
201https://en.wikipedia.org/wiki/Antigua_and_Barbudahttps://en.wikipedia.org/wiki/Flag_of_Antigua_...Antigua and Barbuda[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
202https://en.wikipedia.org/wiki/Angolahttps://en.wikipedia.org/wiki/Flag_of_AngolaAngola[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
203https://en.wikipedia.org/wiki/Andorrahttps://en.wikipedia.org/wiki/Flag_of_AndorraAndorra[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
204https://en.wikipedia.org/wiki/Algeriahttps://en.wikipedia.org/wiki/Flag_of_AlgeriaAlgeria[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "

206 rows × 7 columns

\n", + "" + ], + "text/plain": [ + " country_url \\\n", + "0 https://en.wikipedia.org/wiki/Afghanistan \n", + "1 https://en.wikipedia.org/wiki/Croatia \n", + "2 https://en.wikipedia.org/wiki/Costa_Rica \n", + "3 https://en.wikipedia.org/wiki/Democratic_Repub... \n", + "4 https://en.wikipedia.org/wiki/Comoros \n", + ".. ... \n", + "201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n", + "202 https://en.wikipedia.org/wiki/Angola \n", + "203 https://en.wikipedia.org/wiki/Andorra \n", + "204 https://en.wikipedia.org/wiki/Algeria \n", + "205 https://en.wikipedia.org/wiki/Albania \n", + "\n", + " flag_description_url \\\n", + "0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n", + "1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n", + "2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n", + "3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n", + "4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n", + ".. ... \n", + "201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n", + "202 https://en.wikipedia.org/wiki/Flag_of_Angola \n", + "203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n", + "204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n", + "205 https://en.wikipedia.org/wiki/Flag_of_Albania \n", + "\n", + " short_country_name \\\n", + "0 Afghanistan \n", + "1 Croatia \n", + "2 Costa Rica \n", + "3 Democratic Republic of the Congo \n", + "4 Comoros \n", + ".. ... \n", + "201 Antigua and Barbuda \n", + "202 Angola \n", + "203 Andorra \n", + "204 Algeria \n", + "205 Albania \n", + "\n", + " country_html \\\n", + "0 [The
The The The The The The The The The flag of Albania (\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
0https://en.wikipedia.org/wiki/Afghanistanhttps://en.wikipedia.org/wiki/Flag_of_AfghanistanAfghanistan<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
1https://en.wikipedia.org/wiki/Croatiahttps://en.wikipedia.org/wiki/Flag_of_CroatiaCroatia<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
2https://en.wikipedia.org/wiki/Costa_Ricahttps://en.wikipedia.org/wiki/Flag_of_Costa_RicaCosta Rica<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
3https://en.wikipedia.org/wiki/Democratic_Repub...https://en.wikipedia.org/wiki/Flag_of_the_Demo...Democratic Republic of the Congo<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
4https://en.wikipedia.org/wiki/Comoroshttps://en.wikipedia.org/wiki/Flag_of_ComorosComoros<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <b><a href=\"/wiki/National_flag\" title=...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
........................
201https://en.wikipedia.org/wiki/Antigua_and_Barbudahttps://en.wikipedia.org/wiki/Flag_of_Antigua_...Antigua and Barbuda<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
202https://en.wikipedia.org/wiki/Angolahttps://en.wikipedia.org/wiki/Flag_of_AngolaAngola<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
203https://en.wikipedia.org/wiki/Andorrahttps://en.wikipedia.org/wiki/Flag_of_AndorraAndorra<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
204https://en.wikipedia.org/wiki/Algeriahttps://en.wikipedia.org/wiki/Flag_of_AlgeriaAlgeria<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania<tr><th colspan=\"2\" class=\"infobox-above adr\">...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "

206 rows × 7 columns

\n", + "" + ], + "text/plain": [ + " country_url \\\n", + "0 https://en.wikipedia.org/wiki/Afghanistan \n", + "1 https://en.wikipedia.org/wiki/Croatia \n", + "2 https://en.wikipedia.org/wiki/Costa_Rica \n", + "3 https://en.wikipedia.org/wiki/Democratic_Repub... \n", + "4 https://en.wikipedia.org/wiki/Comoros \n", + ".. ... \n", + "201 https://en.wikipedia.org/wiki/Antigua_and_Barbuda \n", + "202 https://en.wikipedia.org/wiki/Angola \n", + "203 https://en.wikipedia.org/wiki/Andorra \n", + "204 https://en.wikipedia.org/wiki/Algeria \n", + "205 https://en.wikipedia.org/wiki/Albania \n", + "\n", + " flag_description_url \\\n", + "0 https://en.wikipedia.org/wiki/Flag_of_Afghanistan \n", + "1 https://en.wikipedia.org/wiki/Flag_of_Croatia \n", + "2 https://en.wikipedia.org/wiki/Flag_of_Costa_Rica \n", + "3 https://en.wikipedia.org/wiki/Flag_of_the_Demo... \n", + "4 https://en.wikipedia.org/wiki/Flag_of_Comoros \n", + ".. ... \n", + "201 https://en.wikipedia.org/wiki/Flag_of_Antigua_... \n", + "202 https://en.wikipedia.org/wiki/Flag_of_Angola \n", + "203 https://en.wikipedia.org/wiki/Flag_of_Andorra \n", + "204 https://en.wikipedia.org/wiki/Flag_of_Algeria \n", + "205 https://en.wikipedia.org/wiki/Flag_of_Albania \n", + "\n", + " short_country_name \\\n", + "0 Afghanistan \n", + "1 Croatia \n", + "2 Costa Rica \n", + "3 Democratic Republic of the Congo \n", + "4 Comoros \n", + ".. ... \n", + "201 Antigua and Barbuda \n", + "202 Angola \n", + "203 Andorra \n", + "204 Algeria \n", + "205 Albania \n", + "\n", + " country_html \\\n", + "0 ... \n", + "1 ... \n", + "2 ... \n", + "3 ... \n", + "4 ... \n", + ".. ... \n", + "201 ... \n", + "202 ... \n", + "203 ... \n", + "204 ... \n", + "205 ... \n", + "\n", + " flag_html \\\n", + "0

The The The The The The The The The The flag of Albania (` tags" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ab6166f9-daa2-4591-9989-8d33f3f98533", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:45:51.019933Z", + "iopub.status.busy": "2022-06-25T20:45:51.019577Z", + "iopub.status.idle": "2022-06-25T20:45:51.041851Z", + "shell.execute_reply": "2022-06-25T20:45:51.041008Z", + "shell.execute_reply.started": "2022-06-25T20:45:51.019903Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "countries[\"country_html\"] = countries[\"country_html\"].map(\n", + " lambda html: html.replace(\"
\", \"\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ce0e413d-1763-4520-bc36-29715963718c", + "metadata": {}, + "source": [ + "### add root node" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "df924f80-c239-4a35-bc30-888589b34f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:47:29.097287Z", + "iopub.status.busy": "2022-06-25T20:47:29.096936Z", + "iopub.status.idle": "2022-06-25T20:47:29.109510Z", + "shell.execute_reply": "2022-06-25T20:47:29.108689Z", + "shell.execute_reply.started": "2022-06-25T20:47:29.097258Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "countries[\"country_html\"] = countries[\"country_html\"].map(\n", + " lambda html: f\"

\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f5e7249e-988c-420e-bd02-f47d96fd0685", + "metadata": {}, + "source": [ + "## parse\n", + "\n", + "### parse string as xml" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "691bd250-44b9-4adb-9430-b0ef624c986b", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:47:50.180864Z", + "iopub.status.busy": "2022-06-25T20:47:50.180516Z", + "iopub.status.idle": "2022-06-25T20:47:50.184798Z", + "shell.execute_reply": "2022-06-25T20:47:50.183956Z", + "shell.execute_reply.started": "2022-06-25T20:47:50.180835Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "parser = etree.XMLParser(recover=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9967cd92-390a-4b1e-a946-32d46a898eb9", + "metadata": {}, + "source": [ + "#### Afganistan" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "915c338e-9225-46a3-b833-89fdeb971c88", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:47:51.179025Z", + "iopub.status.busy": "2022-06-25T20:47:51.178417Z", + "iopub.status.idle": "2022-06-25T20:47:51.187219Z", + "shell.execute_reply": "2022-06-25T20:47:51.186016Z", + "shell.execute_reply.started": "2022-06-25T20:47:51.178977Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# root = etree.fromstringlist(countries[\"country_html\"].iloc[0], parser)\n", + "root = etree.fromstring(countries[\"country_html\"].iloc[0], parser)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "771453ab-2b00-42e5-a026-3bcea7fc6476", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:47:52.012467Z", + "iopub.status.busy": "2022-06-25T20:47:52.011430Z", + "iopub.status.idle": "2022-06-25T20:47:52.020811Z", + "shell.execute_reply": "2022-06-25T20:47:52.019658Z", + "shell.execute_reply.started": "2022-06-25T20:47:52.012399Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + 
"execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "root" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "516153ae-95b8-44b0-8bd3-51914ba95f4c", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:47:52.947860Z", + "iopub.status.busy": "2022-06-25T20:47:52.947172Z", + "iopub.status.idle": "2022-06-25T20:47:52.955283Z", + "shell.execute_reply": "2022-06-25T20:47:52.954038Z", + "shell.execute_reply.started": "2022-06-25T20:47:52.947809Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b'Kabul'\n", + "Kabul\n", + "\n" + ] + } + ], + "source": [ + "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td/a\"):\n", + " print(etree.tostring(element))\n", + " print(element.text)\n", + " print(element)" + ] + }, + { + "cell_type": "markdown", + "id": "a9d0a316-fa1b-4b04-9090-ab90f72dee19", + "metadata": {}, + "source": [ + "#### South Africa" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "769b4492-28a0-4454-8e92-806c4e646660", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:47:58.319337Z", + "iopub.status.busy": "2022-06-25T20:47:58.317420Z", + "iopub.status.idle": "2022-06-25T20:47:58.339980Z", + "shell.execute_reply": "2022-06-25T20:47:58.339214Z", + "shell.execute_reply.started": "2022-06-25T20:47:58.319276Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
9https://en.wikipedia.org/wiki/Central_African_...https://en.wikipedia.org/wiki/Flag_of_the_Cent...Central African Republic<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of the <a href=\"/wiki/Central_A...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
58https://en.wikipedia.org/wiki/South_Africahttps://en.wikipedia.org/wiki/Flag_of_South_Af...South Africa<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of South Africa</b> was designe...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "
" + ], + "text/plain": [ + " country_url \\\n", + "9 https://en.wikipedia.org/wiki/Central_African_... \n", + "58 https://en.wikipedia.org/wiki/South_Africa \n", + "\n", + " flag_description_url \\\n", + "9 https://en.wikipedia.org/wiki/Flag_of_the_Cent... \n", + "58 https://en.wikipedia.org/wiki/Flag_of_South_Af... \n", + "\n", + " short_country_name \\\n", + "9 Central African Republic \n", + "58 South Africa \n", + "\n", + " country_html \\\n", + "9
The flag of the The flag of South Africa was designe... \n", + "\n", + " file_urls \\\n", + "9 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "58 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "\n", + " files \n", + "9 [{'url': 'https://upload.wikimedia.org/wikiped... \n", + "58 [{'url': 'https://upload.wikimedia.org/wikiped... " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "countries[countries[\"short_country_name\"].map(lambda country: \"Africa\" in country)]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "f06833fe-dee5-42dc-b0ad-71f9759e0fb1", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:21:03.803935Z", + "iopub.status.busy": "2022-06-25T21:21:03.803558Z", + "iopub.status.idle": "2022-06-25T21:21:03.809310Z", + "shell.execute_reply": "2022-06-25T21:21:03.808433Z", + "shell.execute_reply.started": "2022-06-25T21:21:03.803906Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# root = etree.fromstringlist(countries[\"country_html\"].iloc[58], parser)\n", + "root = etree.fromstring(countries[\"country_html\"].iloc[58], parser)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "70f25f0c-92bb-43ee-9223-15d1ad9fb691", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:22:53.884677Z", + "iopub.status.busy": "2022-06-25T21:22:53.884343Z", + "iopub.status.idle": "2022-06-25T21:22:53.893752Z", + "shell.execute_reply": "2022-06-25T21:22:53.893051Z", + "shell.execute_reply.started": "2022-06-25T21:22:53.884650Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", + " for capital in element:\n", + " capital_filter = capital.xpath(\"ul//a\")\n", + " branch_filter = capital.xpath(\"ul//li\")\n", + "\n", + " _capitals = []\n", + " for item in capital_filter:\n", + " if not isinstance(re.search(r\"\\d\", 
item.text), re.Match):\n", + " _capitals.append(item.text)\n", + " for match in branch_filter:\n", + " branch = match.xpath(\"text()\")\n", + " _capitals.append(re.search(r\"(?:\\()([^\\)]*)\", branch[0])[1])\n", + "\n", + " result = {\n", + " \"index\": 58,\n", + " \"country_name\": countries[\"short_country_name\"].iloc[58],\n", + " \"capital\": _capitals,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "4d6fffaa-c08f-4ae9-b1d4-d1d27550a903", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:22:57.761492Z", + "iopub.status.busy": "2022-06-25T21:22:57.761115Z", + "iopub.status.idle": "2022-06-25T21:22:57.767381Z", + "shell.execute_reply": "2022-06-25T21:22:57.766523Z", + "shell.execute_reply.started": "2022-06-25T21:22:57.761462Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'index': 58,\n", + " 'country_name': 'South Africa',\n", + " 'capital': ['Pretoria',\n", + " 'Cape Town',\n", + " 'Bloemfontein',\n", + " 'executive',\n", + " 'legislative',\n", + " 'judicial']}" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "markdown", + "id": "8af8a656-3e07-4de0-933e-012d218c18f9", + "metadata": {}, + "source": [ + "#### Albania" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "33205ffb-1b50-468b-8d5b-3c7302a8b285", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:48:07.893538Z", + "iopub.status.busy": "2022-06-25T20:48:07.892909Z", + "iopub.status.idle": "2022-06-25T20:48:07.905915Z", + "shell.execute_reply": "2022-06-25T20:48:07.905138Z", + "shell.execute_reply.started": "2022-06-25T20:48:07.893507Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
205https://en.wikipedia.org/wiki/Albaniahttps://en.wikipedia.org/wiki/Flag_of_AlbaniaAlbania<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of Albania</b> (<a href=\"/wiki/...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "
" + ], + "text/plain": [ + " country_url \\\n", + "205 https://en.wikipedia.org/wiki/Albania \n", + "\n", + " flag_description_url short_country_name \\\n", + "205 https://en.wikipedia.org/wiki/Flag_of_Albania Albania \n", + "\n", + " country_html \\\n", + "205
The flag of Albania (\n" + ] + } + ], + "source": [ + "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", + " print(element)" + ] + }, + { + "cell_type": "markdown", + "id": "5c316764-016f-49be-a3ad-1485dceb5b0c", + "metadata": {}, + "source": [ + "#### Sahrawi Arab Democratic Republic" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "3dfd7e29-6166-483a-b454-09daadf2ea20", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:26:21.907262Z", + "iopub.status.busy": "2022-06-25T21:26:21.906430Z", + "iopub.status.idle": "2022-06-25T21:26:21.924749Z", + "shell.execute_reply": "2022-06-25T21:26:21.923775Z", + "shell.execute_reply.started": "2022-06-25T21:26:21.907228Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
20https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...Sahrawi Arab Democratic Republic<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of Western Sahara</b> (Arabic: ...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
39https://en.wikipedia.org/wiki/United_Arab_Emir...https://en.wikipedia.org/wiki/Flag_of_the_Unit...United Arab Emirates<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <a href=\"/wiki/National_flag\" title=\"Na...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
69https://en.wikipedia.org/wiki/Saudi_Arabiahttps://en.wikipedia.org/wiki/Flag_of_Saudi_Ar...Saudi Arabia<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of the Kingdom of Saudi Arabia<...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "
" + ], + "text/plain": [ + " country_url \\\n", + "20 https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem... \n", + "39 https://en.wikipedia.org/wiki/United_Arab_Emir... \n", + "69 https://en.wikipedia.org/wiki/Saudi_Arabia \n", + "\n", + " flag_description_url \\\n", + "20 https://en.wikipedia.org/wiki/Flag_of_Sahrawi_... \n", + "39 https://en.wikipedia.org/wiki/Flag_of_the_Unit... \n", + "69 https://en.wikipedia.org/wiki/Flag_of_Saudi_Ar... \n", + "\n", + " short_country_name \\\n", + "20 Sahrawi Arab Democratic Republic \n", + "39 United Arab Emirates \n", + "69 Saudi Arabia \n", + "\n", + " country_html \\\n", + "20
The flag of Western Sahara (Arabic: ... \n", + "39

The The flag of the Kingdom of Saudi Arabia<... \n", + "\n", + " file_urls \\\n", + "20 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "39 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "69 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "\n", + " files \n", + "20 [{'url': 'https://upload.wikimedia.org/wikiped... \n", + "39 [{'url': 'https://upload.wikimedia.org/wikiped... \n", + "69 [{'url': 'https://upload.wikimedia.org/wikiped... " + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "countries[countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "02e1f53a-9081-4edc-9a61-41da56166274", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:26:26.463033Z", + "iopub.status.busy": "2022-06-25T21:26:26.462631Z", + "iopub.status.idle": "2022-06-25T21:26:26.469880Z", + "shell.execute_reply": "2022-06-25T21:26:26.469125Z", + "shell.execute_reply.started": "2022-06-25T21:26:26.463004Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "country_url https://en.wikipedia.org/wiki/Sahrawi_Arab_Dem...\n", + "flag_description_url https://en.wikipedia.org/wiki/Flag_of_Sahrawi_...\n", + "short_country_name Sahrawi Arab Democratic Republic\n", + "country_html

The flag of Western Sahara (Arabic: ...\n", + "file_urls [https:////upload.wikimedia.org/wikipedia/comm...\n", + "files [{'url': 'https://upload.wikimedia.org/wikiped...\n", + "Name: 20, dtype: object" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "countries.iloc[20]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "3b74a848-af64-4ea8-b2b2-3b78c9fb843e", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:26:39.089757Z", + "iopub.status.busy": "2022-06-25T21:26:39.089324Z", + "iopub.status.idle": "2022-06-25T21:26:39.095857Z", + "shell.execute_reply": "2022-06-25T21:26:39.094902Z", + "shell.execute_reply.started": "2022-06-25T21:26:39.089723Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "root = etree.fromstring(countries[\"country_html\"].iloc[20], parser)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "f8829405-0533-4349-9c99-05e9e1e41607", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:49:34.812182Z", + "iopub.status.busy": "2022-06-25T21:49:34.811795Z", + "iopub.status.idle": "2022-06-25T21:49:34.820550Z", + "shell.execute_reply": "2022-06-25T21:49:34.819761Z", + "shell.execute_reply.started": "2022-06-25T21:49:34.812153Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b'
'\n", + "b'de jure)'\n", + "match de jure\n", + "b'de facto)'\n", + "match de facto\n" + ] + } + ], + "source": [ + "for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", + " for capital in element:\n", + " print(etree.tostring(capital))\n", + " capital_filter = capital.xpath(\"ul//a\")\n", + " branch_filter = capital.xpath(\"ul//li//i\")\n", + "\n", + " _capitals = []\n", + " for item in capital_filter:\n", + " if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n", + " _capitals.append(item.text)\n", + " for match in branch_filter:\n", + " print(etree.tostring(match))\n", + " print(\"match\", match.text)\n", + " if isinstance(re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n", + " _capitals.append(match.text)\n", + " result = {\n", + " \"index\": 20,\n", + " \"country_name\": countries[\"short_country_name\"].iloc[20],\n", + " \"capital\": _capitals,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "92d7c4ac-44f9-46e6-b8ae-94dffa276f84", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T21:49:35.379470Z", + "iopub.status.busy": "2022-06-25T21:49:35.378687Z", + "iopub.status.idle": "2022-06-25T21:49:35.388295Z", + "shell.execute_reply": "2022-06-25T21:49:35.386602Z", + "shell.execute_reply.started": "2022-06-25T21:49:35.379399Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'index': 20,\n", + " 'country_name': 'Sahrawi Arab Democratic Republic',\n", + " 'capital': ['El Aaiún', 'Tifariti', 'de jure', 'de facto']}" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "markdown", + "id": "9d909d88-e086-41fe-a46f-1cd0a21ec625", + "metadata": {}, + "source": [ + "## define functions" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "9263180c-92ed-4f5f-a479-70af90599b75", + "metadata": { + 
"execution": { + "iopub.execute_input": "2022-06-25T21:51:58.514700Z", + "iopub.status.busy": "2022-06-25T21:51:58.514295Z", + "iopub.status.idle": "2022-06-25T21:51:58.524855Z", + "shell.execute_reply": "2022-06-25T21:51:58.524175Z", + "shell.execute_reply.started": "2022-06-25T21:51:58.514671Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def extract_capital_0(index: int, country_name: str, country_html: str):\n", + " result = None\n", + " root = etree.fromstring(country_html, parser)\n", + " \n", + " # matches single capital\n", + " for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td/a\"):\n", + " result = {\"index\": index, \"country_name\": country_name, \"capital\": element.text}\n", + "\n", + " # matches multiple capitals with numbered footnote\n", + " try:\n", + " if result is None:\n", + " for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", + " for capital in element:\n", + " capital_filter = capital.xpath(\"ul//a\")\n", + " branch_filter = capital.xpath(\"ul//li\")\n", + "\n", + " _capitals = []\n", + " for item in capital_filter:\n", + " if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n", + " _capitals.append(item.text)\n", + " for match in branch_filter:\n", + " branch = match.xpath(\"text()\")\n", + " _capitals.append(re.search(r\"(?:\\()([^\\)]*)\", branch[0])[1])\n", + " result = {\n", + " \"index\": index,\n", + " \"country_name\": country_name,\n", + " \"capital\": _capitals,\n", + " }\n", + " except IndexError:\n", + " result = None\n", + " \n", + " # matches mutiple capitals with italic footnote\n", + " if result is None:\n", + " for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n", + " for capital in element:\n", + " capital_filter = capital.xpath(\"ul//a\")\n", + " branch_filter = capital.xpath(\"ul//li//i\")\n", + "\n", + " _capitals = []\n", + " for item in capital_filter:\n", + " if not isinstance(re.search(r\"\\d\", item.text), 
re.Match):\n", + " _capitals.append(item.text)\n", + " for match in branch_filter:\n", + " if isinstance(re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n", + " _capitals.append(match.text)\n", + " result = {\n", + " \"index\": 20,\n", + " \"country_name\": countries[\"short_country_name\"].iloc[20],\n", + " \"capital\": _capitals,\n", + " }\n", + "\n", + " return result or None" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "0b39d1c7-a1ee-4031-a0eb-93ab5ef525ff", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T22:10:22.757906Z", + "iopub.status.busy": "2022-06-25T22:10:22.757413Z", + "iopub.status.idle": "2022-06-25T22:10:22.881155Z", + "shell.execute_reply": "2022-06-25T22:10:22.880194Z", + "shell.execute_reply.started": "2022-06-25T22:10:22.757877Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'index': 0, 'country_name': 'Afghanistan', 'capital': None}\n", + "{'index': 1, 'country_name': 'Croatia', 'capital': None}\n", + "{'index': 2, 'country_name': 'Costa Rica', 'capital': None}\n", + "{'index': 3, 'country_name': 'Democratic Republic of the Congo', 'capital': None}\n", + "{'index': 4, 'country_name': 'Comoros', 'capital': None}\n", + "{'index': 5, 'country_name': 'Republic of the Congo', 'capital': None}\n", + "{'index': 6, 'country_name': 'China', 'capital': None}\n", + "{'index': 7, 'country_name': 'Chile', 'capital': None}\n", + "{'index': 8, 'country_name': 'Chad', 'capital': None}\n", + "{'index': 9, 'country_name': 'Central African Republic', 'capital': None}\n", + "{'index': 10, 'country_name': 'Cape Verde', 'capital': None}\n", + "{'index': 11, 'country_name': 'Colombia', 'capital': None}\n", + "{'index': 12, 'country_name': 'Cameroon', 'capital': None}\n", + "{'index': 13, 'country_name': 'Cambodia', 'capital': None}\n", + "{'index': 14, 'country_name': 'Burundi', 'capital': None}\n", + "{'index': 15, 
'country_name': 'Transnistria', 'capital': None}\n", + "{'index': 16, 'country_name': 'Canada', 'capital': None}\n", + "{'index': 17, 'country_name': 'Taiwan', 'capital': None}\n", + "{'index': 18, 'country_name': 'South Ossetia', 'capital': None}\n", + "{'index': 19, 'country_name': 'Somaliland', 'capital': None}\n", + "{'index': 20, 'country_name': 'Sahrawi Arab Democratic Republic', 'capital': None}\n", + "{'index': 21, 'country_name': 'Northern Cyprus', 'capital': None}\n", + "{'index': 22, 'country_name': 'Niue', 'capital': None}\n", + "{'index': 23, 'country_name': 'Kosovo', 'capital': None}\n", + "{'index': 24, 'country_name': 'Cook Islands', 'capital': None}\n", + "{'index': 25, 'country_name': 'Republic of Artsakh', 'capital': None}\n", + "{'index': 26, 'country_name': 'Abkhazia', 'capital': None}\n", + "{'index': 27, 'country_name': 'Zimbabwe', 'capital': None}\n", + "{'index': 28, 'country_name': 'Zambia', 'capital': None}\n", + "{'index': 29, 'country_name': 'Yemen', 'capital': None}\n", + "{'index': 30, 'country_name': 'Vietnam', 'capital': None}\n", + "{'index': 31, 'country_name': 'Venezuela', 'capital': None}\n", + "{'index': 32, 'country_name': 'Luhansk People%27s Republic', 'capital': None}\n", + "None\n", + "{'index': 34, 'country_name': 'Vanuatu', 'capital': None}\n", + "{'index': 35, 'country_name': 'Uzbekistan', 'capital': None}\n", + "{'index': 36, 'country_name': 'Uruguay', 'capital': None}\n", + "{'index': 37, 'country_name': 'United Kingdom', 'capital': None}\n", + "{'index': 38, 'country_name': 'United States', 'capital': None}\n", + "{'index': 39, 'country_name': 'United Arab Emirates', 'capital': None}\n", + "{'index': 40, 'country_name': 'Uganda', 'capital': None}\n", + "{'index': 41, 'country_name': 'Tuvalu', 'capital': None}\n", + "{'index': 42, 'country_name': 'Tunisia', 'capital': None}\n", + "{'index': 43, 'country_name': 'Turkmenistan', 'capital': None}\n", + "{'index': 44, 'country_name': 'Tonga', 'capital': None}\n", + 
"{'index': 45, 'country_name': 'Trinidad and Tobago', 'capital': None}\n", + "{'index': 46, 'country_name': 'Togo', 'capital': None}\n", + "{'index': 47, 'country_name': 'Ukraine', 'capital': None}\n", + "{'index': 48, 'country_name': 'Thailand', 'capital': None}\n", + "{'index': 49, 'country_name': 'Tanzania', 'capital': None}\n", + "{'index': 50, 'country_name': 'Tajikistan', 'capital': None}\n", + "{'index': 51, 'country_name': 'Syria', 'capital': None}\n", + "{'index': 52, 'country_name': 'Switzerland', 'capital': None}\n", + "{'index': 53, 'country_name': 'Sweden', 'capital': None}\n", + "{'index': 54, 'country_name': 'Sudan', 'capital': None}\n", + "{'index': 55, 'country_name': 'Suriname', 'capital': None}\n", + "{'index': 56, 'country_name': 'Sri Lanka', 'capital': None}\n", + "{'index': 57, 'country_name': 'Spain', 'capital': None}\n", + "{'index': 58, 'country_name': 'South Africa', 'capital': None}\n", + "{'index': 59, 'country_name': 'Somalia', 'capital': None}\n", + "{'index': 60, 'country_name': 'Solomon Islands', 'capital': None}\n", + "{'index': 61, 'country_name': 'Slovenia', 'capital': None}\n", + "{'index': 62, 'country_name': 'South Sudan', 'capital': None}\n", + "{'index': 63, 'country_name': 'Slovakia', 'capital': None}\n", + "None\n", + "{'index': 65, 'country_name': 'Sierra Leone', 'capital': None}\n", + "{'index': 66, 'country_name': 'Seychelles', 'capital': None}\n", + "{'index': 67, 'country_name': 'Serbia', 'capital': None}\n", + "{'index': 68, 'country_name': 'Senegal', 'capital': None}\n", + "{'index': 69, 'country_name': 'Saudi Arabia', 'capital': None}\n", + "{'index': 70, 'country_name': 'S%C3%A3o Tom%C3%A9 and Pr%C3%ADncipe', 'capital': None}\n", + "{'index': 71, 'country_name': 'San Marino', 'capital': None}\n", + "{'index': 72, 'country_name': 'Samoa', 'capital': None}\n", + "{'index': 73, 'country_name': 'Saint Vincent and the Grenadines', 'capital': None}\n", + "{'index': 74, 'country_name': 'Saint Lucia', 'capital': None}\n", 
+ "{'index': 75, 'country_name': 'Rwanda', 'capital': None}\n", + "{'index': 76, 'country_name': 'Romania', 'capital': None}\n", + "{'index': 77, 'country_name': 'Portugal', 'capital': None}\n", + "{'index': 78, 'country_name': 'Poland', 'capital': None}\n", + "{'index': 79, 'country_name': 'Saint Kitts and Nevis', 'capital': None}\n", + "{'index': 80, 'country_name': 'Philippines', 'capital': None}\n", + "{'index': 81, 'country_name': 'Qatar', 'capital': None}\n", + "{'index': 82, 'country_name': 'Russia', 'capital': None}\n", + "{'index': 83, 'country_name': 'Peru', 'capital': None}\n", + "{'index': 84, 'country_name': 'Papua New Guinea', 'capital': None}\n", + "{'index': 85, 'country_name': 'Paraguay', 'capital': None}\n", + "{'index': 86, 'country_name': 'Panama', 'capital': None}\n", + "None\n", + "{'index': 88, 'country_name': 'Palau', 'capital': None}\n", + "{'index': 89, 'country_name': 'Oman', 'capital': None}\n", + "{'index': 90, 'country_name': 'North Macedonia', 'capital': None}\n", + "{'index': 91, 'country_name': 'Norway', 'capital': None}\n", + "{'index': 92, 'country_name': 'Pakistan', 'capital': None}\n", + "{'index': 93, 'country_name': 'Nigeria', 'capital': None}\n", + "{'index': 94, 'country_name': 'Niger', 'capital': None}\n", + "{'index': 95, 'country_name': 'Nicaragua', 'capital': None}\n", + "{'index': 96, 'country_name': 'Kingdom of the Netherlands', 'capital': None}\n", + "{'index': 97, 'country_name': 'Nepal', 'capital': None}\n", + "{'index': 98, 'country_name': 'Nauru', 'capital': None}\n", + "{'index': 99, 'country_name': 'New Zealand', 'capital': None}\n", + "{'index': 100, 'country_name': 'Namibia', 'capital': None}\n", + "{'index': 101, 'country_name': 'Myanmar', 'capital': None}\n", + "{'index': 102, 'country_name': 'Mozambique', 'capital': None}\n", + "{'index': 103, 'country_name': 'Morocco', 'capital': None}\n", + "{'index': 104, 'country_name': 'Montenegro', 'capital': None}\n", + "{'index': 105, 'country_name': 'Monaco', 
'capital': None}\n", + "{'index': 106, 'country_name': 'Moldova', 'capital': None}\n", + "{'index': 107, 'country_name': 'Federated States of Micronesia', 'capital': None}\n", + "{'index': 108, 'country_name': 'Mexico', 'capital': None}\n", + "{'index': 109, 'country_name': 'Mongolia', 'capital': None}\n", + "{'index': 110, 'country_name': 'Mauritius', 'capital': None}\n", + "{'index': 111, 'country_name': 'Mauritania', 'capital': None}\n", + "{'index': 112, 'country_name': 'Marshall Islands', 'capital': None}\n", + "{'index': 113, 'country_name': 'Malta', 'capital': None}\n", + "{'index': 114, 'country_name': 'Mali', 'capital': None}\n", + "{'index': 115, 'country_name': 'Maldives', 'capital': None}\n", + "{'index': 116, 'country_name': 'Malaysia', 'capital': None}\n", + "{'index': 117, 'country_name': 'Malawi', 'capital': None}\n", + "{'index': 118, 'country_name': 'Madagascar', 'capital': None}\n", + "{'index': 119, 'country_name': 'Luxembourg', 'capital': None}\n", + "{'index': 120, 'country_name': 'Lithuania', 'capital': None}\n", + "{'index': 121, 'country_name': 'Liechtenstein', 'capital': None}\n", + "{'index': 122, 'country_name': 'Libya', 'capital': None}\n", + "{'index': 123, 'country_name': 'Liberia', 'capital': None}\n", + "{'index': 124, 'country_name': 'Lesotho', 'capital': None}\n", + "{'index': 125, 'country_name': 'Lebanon', 'capital': None}\n", + "{'index': 126, 'country_name': 'Latvia', 'capital': None}\n", + "{'index': 127, 'country_name': 'Laos', 'capital': None}\n", + "{'index': 128, 'country_name': 'Kyrgyzstan', 'capital': None}\n", + "{'index': 129, 'country_name': 'Kuwait', 'capital': None}\n", + "{'index': 130, 'country_name': 'South Korea', 'capital': None}\n", + "{'index': 131, 'country_name': 'North Korea', 'capital': None}\n", + "{'index': 132, 'country_name': 'Kiribati', 'capital': None}\n", + "{'index': 133, 'country_name': 'Kenya', 'capital': None}\n", + "{'index': 134, 'country_name': 'Kazakhstan', 'capital': None}\n", + 
"{'index': 135, 'country_name': 'Jordan', 'capital': None}\n", + "{'index': 136, 'country_name': 'Japan', 'capital': None}\n", + "{'index': 137, 'country_name': 'Jamaica', 'capital': None}\n", + "{'index': 138, 'country_name': 'Ivory Coast', 'capital': None}\n", + "{'index': 139, 'country_name': 'Italy', 'capital': None}\n", + "{'index': 140, 'country_name': 'Israel', 'capital': None}\n", + "{'index': 141, 'country_name': 'Republic of Ireland', 'capital': None}\n", + "{'index': 142, 'country_name': 'Iraq', 'capital': None}\n", + "{'index': 143, 'country_name': 'Iran', 'capital': None}\n", + "{'index': 144, 'country_name': 'Indonesia', 'capital': None}\n", + "{'index': 145, 'country_name': 'India', 'capital': None}\n", + "{'index': 146, 'country_name': 'Iceland', 'capital': None}\n", + "{'index': 147, 'country_name': 'Hungary', 'capital': None}\n", + "{'index': 148, 'country_name': 'Honduras', 'capital': None}\n", + "{'index': 149, 'country_name': 'Guyana', 'capital': None}\n", + "{'index': 150, 'country_name': 'Haiti', 'capital': None}\n", + "{'index': 151, 'country_name': 'Guinea-Bissau', 'capital': None}\n", + "{'index': 152, 'country_name': 'Guinea', 'capital': None}\n", + "{'index': 153, 'country_name': 'Guatemala', 'capital': None}\n", + "{'index': 154, 'country_name': 'Grenada', 'capital': None}\n", + "{'index': 155, 'country_name': 'Greece', 'capital': None}\n", + "{'index': 156, 'country_name': 'Ghana', 'capital': None}\n", + "{'index': 157, 'country_name': 'Germany', 'capital': None}\n", + "{'index': 158, 'country_name': 'Georgia (country)', 'capital': None}\n", + "{'index': 159, 'country_name': 'The Gambia', 'capital': None}\n", + "{'index': 160, 'country_name': 'Gabon', 'capital': None}\n", + "{'index': 161, 'country_name': 'Finland', 'capital': None}\n", + "{'index': 162, 'country_name': 'Fiji', 'capital': None}\n", + "{'index': 163, 'country_name': 'Ethiopia', 'capital': None}\n", + "{'index': 164, 'country_name': 'Eswatini', 'capital': None}\n", + 
"{'index': 165, 'country_name': 'Estonia', 'capital': None}\n", + "{'index': 166, 'country_name': 'France', 'capital': None}\n", + "{'index': 167, 'country_name': 'Eritrea', 'capital': None}\n", + "{'index': 168, 'country_name': 'Equatorial Guinea', 'capital': None}\n", + "{'index': 169, 'country_name': 'El Salvador', 'capital': None}\n", + "{'index': 170, 'country_name': 'Egypt', 'capital': None}\n", + "{'index': 171, 'country_name': 'Ecuador', 'capital': None}\n", + "{'index': 172, 'country_name': 'East Timor', 'capital': None}\n", + "{'index': 173, 'country_name': 'Dominica', 'capital': None}\n", + "{'index': 174, 'country_name': 'Dominican Republic', 'capital': None}\n", + "{'index': 175, 'country_name': 'Djibouti', 'capital': None}\n", + "{'index': 176, 'country_name': 'Danish Realm', 'capital': None}\n", + "{'index': 177, 'country_name': 'Czech Republic', 'capital': None}\n", + "{'index': 178, 'country_name': 'Cyprus', 'capital': None}\n", + "{'index': 179, 'country_name': 'Cuba', 'capital': None}\n", + "{'index': 180, 'country_name': 'Burkina Faso', 'capital': None}\n", + "{'index': 181, 'country_name': 'Bulgaria', 'capital': None}\n", + "{'index': 182, 'country_name': 'Brunei', 'capital': None}\n", + "{'index': 183, 'country_name': 'Brazil', 'capital': None}\n", + "{'index': 184, 'country_name': 'Botswana', 'capital': None}\n", + "{'index': 185, 'country_name': 'Bosnia and Herzegovina', 'capital': None}\n", + "{'index': 186, 'country_name': 'Bolivia', 'capital': None}\n", + "{'index': 187, 'country_name': 'Bhutan', 'capital': None}\n", + "{'index': 188, 'country_name': 'Benin', 'capital': None}\n", + "{'index': 189, 'country_name': 'Belize', 'capital': None}\n", + "{'index': 190, 'country_name': 'Belgium', 'capital': None}\n", + "{'index': 191, 'country_name': 'Belarus', 'capital': None}\n", + "{'index': 192, 'country_name': 'Barbados', 'capital': None}\n", + "{'index': 193, 'country_name': 'Bangladesh', 'capital': None}\n", + "{'index': 194, 
'country_name': 'Bahrain', 'capital': None}\n", + "{'index': 195, 'country_name': 'The Bahamas', 'capital': None}\n", + "{'index': 196, 'country_name': 'Azerbaijan', 'capital': None}\n", + "{'index': 197, 'country_name': 'Austria', 'capital': None}\n", + "{'index': 198, 'country_name': 'Australia', 'capital': None}\n", + "{'index': 199, 'country_name': 'Armenia', 'capital': None}\n", + "{'index': 200, 'country_name': 'Argentina', 'capital': None}\n", + "{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': '\\n'}\n", + "{'index': 202, 'country_name': 'Angola', 'capital': None}\n", + "{'index': 203, 'country_name': 'Andorra', 'capital': None}\n", + "{'index': 204, 'country_name': 'Algeria', 'capital': None}\n", + "{'index': 205, 'country_name': 'Albania', 'capital': None}\n" + ] + } + ], + "source": [ + "for index, country_name, country_html in zip(\n", + " countries.index, countries[\"short_country_name\"], countries[\"country_html\"]\n", + "):\n", + " # print(json.dumps(extract_capital_0(index, country_name, country_html)))\n", + " print(extract_capital_0(index, country_name, country_html))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c147ae2-cc6f-4ac9-9c6a-81fb8e71d1d1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cc8e0f7-36ab-41b8-8ce9-d0ebe02a9896", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "914e0498-c75d-4c40-8320-ec62f5db8764", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T20:16:19.646094Z", + "iopub.status.busy": "2022-06-25T20:16:19.645699Z", + "iopub.status.idle": "2022-06-25T20:16:19.649925Z", + "shell.execute_reply": "2022-06-25T20:16:19.648982Z", + "shell.execute_reply.started": "2022-06-25T20:16:19.646064Z" + } + }, + "source": [ + "## Playground" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "57d8b5a9-dd20-4214-866a-b57ecc7ad5da", + 
"metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T22:09:14.820234Z", + "iopub.status.busy": "2022-06-25T22:09:14.819573Z", + "iopub.status.idle": "2022-06-25T22:09:14.830451Z", + "shell.execute_reply": "2022-06-25T22:09:14.829525Z", + "shell.execute_reply.started": "2022-06-25T22:09:14.820174Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "country_url https://en.wikipedia.org/wiki/Malaysia\n", + "flag_description_url https://en.wikipedia.org/wiki/Flag_of_Malaysia\n", + "short_country_name Malaysia\n", + "country_html
The \u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m116\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mextract_capital_0\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcountries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshort_country_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcountries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcountry_html\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "Input \u001b[0;32mIn [120]\u001b[0m, in \u001b[0;36mextract_capital_0\u001b[0;34m(index, country_name, country_html)\u001b[0m\n\u001b[1;32m 17\u001b[0m _capitals \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m capital_filter:\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43md\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m, re\u001b[38;5;241m.\u001b[39mMatch):\n\u001b[1;32m 20\u001b[0m _capitals\u001b[38;5;241m.\u001b[39mappend(item\u001b[38;5;241m.\u001b[39mtext)\n\u001b[1;32m 21\u001b[0m 
\u001b[38;5;28;01mfor\u001b[39;00m match \u001b[38;5;129;01min\u001b[39;00m branch_filter:\n", + "File \u001b[0;32m~/.pyenv/versions/3.8.12/lib/python3.8/re.py:201\u001b[0m, in \u001b[0;36msearch\u001b[0;34m(pattern, string, flags)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(pattern, string, flags\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m):\n\u001b[1;32m 199\u001b[0m \u001b[38;5;124;03m\"\"\"Scan through string looking for a match to the pattern, returning\u001b[39;00m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;124;03m a Match object, or None if no match was found.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpattern\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflags\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstring\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mTypeError\u001b[0m: expected string or bytes-like object" + ] + } + ], + "source": [ + "index = 116\n", + "extract_capital_0(index, countries.iloc[index][\"short_country_name\"], countries.iloc[index][\"country_html\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e066fde4-b290-49d7-80d2-64e6003efcec", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/playground/downloaded_data_inspection_lab/exploration.ipynb b/playground/downloaded_data_inspection_lab/exploration.ipynb new file 
mode 100644 index 0000000..9a6d80d --- /dev/null +++ b/playground/downloaded_data_inspection_lab/exploration.ipynb @@ -0,0 +1,599 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d8185790-0793-4881-99e8-6730f95a8006", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T22:04:54.386982Z", + "iopub.status.busy": "2022-06-24T22:04:54.386313Z", + "iopub.status.idle": "2022-06-24T22:04:54.854521Z", + "shell.execute_reply": "2022-06-24T22:04:54.853581Z", + "shell.execute_reply.started": "2022-06-24T22:04:54.386910Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "import pathlib\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T22:04:55.458615Z", + "iopub.status.busy": "2022-06-24T22:04:55.457695Z", + "iopub.status.idle": "2022-06-24T22:04:55.475878Z", + "shell.execute_reply": "2022-06-24T22:04:55.474706Z", + "shell.execute_reply.started": "2022-06-24T22:04:55.458548Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[None]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_options = {\n", + " \"display.max_rows\": None,\n", + "}\n", + "\n", + "[pd.set_option(option, value) for option, value in pd_options.items()]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "36149580-91d9-431d-99c3-51feee829e79", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T22:04:56.134416Z", + "iopub.status.busy": "2022-06-24T22:04:56.133745Z", + "iopub.status.idle": "2022-06-24T22:04:56.140326Z", + "shell.execute_reply": "2022-06-24T22:04:56.138507Z", + "shell.execute_reply.started": "2022-06-24T22:04:56.134371Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data_directory = (\n", + " 
pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d03be94e-8642-4916-8a43-1711e0c21b36", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T22:04:56.621163Z", + "iopub.status.busy": "2022-06-24T22:04:56.620692Z", + "iopub.status.idle": "2022-06-24T22:04:56.731001Z", + "shell.execute_reply": "2022-06-24T22:04:56.728392Z", + "shell.execute_reply.started": "2022-06-24T22:04:56.621128Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "countries_file = data_directory / \"countries.json\"\n", + "countries = json.loads(countries_file.read_text())\n", + "# countries" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:40:34.482040Z", + "iopub.status.busy": "2022-06-25T16:40:34.481539Z", + "iopub.status.idle": "2022-06-25T16:40:34.624178Z", + "shell.execute_reply": "2022-06-25T16:40:34.618757Z", + "shell.execute_reply.started": "2022-06-25T16:40:34.482012Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['country_url', 'flag_description_url', 'short_country_name',\n", + " 'country_html', 'flag_html', 'file_urls', 'files'],\n", + " dtype='object')" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_json(countries_file)\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:40:32.210227Z", + "iopub.status.busy": "2022-06-25T16:40:32.209873Z", + "iopub.status.idle": "2022-06-25T16:40:32.223710Z", + "shell.execute_reply": "2022-06-25T16:40:32.222746Z", + "shell.execute_reply.started": "2022-06-25T16:40:32.210199Z" + }, + "tags": [] + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "(206,)\n", + "[False]\n", + "[False]\n" + ] + } + ], + "source": [ + "country_url = df[\"country_url\"]\n", + "print(country_url.shape)\n", + "print(country_url.isnull().unique())\n", + "print(country_url.isna().unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "48db8f93-659b-45a4-8477-a7cec139bebc", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T22:04:59.710467Z", + "iopub.status.busy": "2022-06-24T22:04:59.709874Z", + "iopub.status.idle": "2022-06-24T22:04:59.720517Z", + "shell.execute_reply": "2022-06-24T22:04:59.717623Z", + "shell.execute_reply.started": "2022-06-24T22:04:59.710431Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(206,)\n", + "[False]\n", + "[False]\n" + ] + } + ], + "source": [ + "short_country_name = df[\"short_country_name\"]\n", + "print(short_country_name.shape)\n", + "print(short_country_name.isnull().unique())\n", + "print(short_country_name.isna().unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:40:28.811675Z", + "iopub.status.busy": "2022-06-25T16:40:28.810872Z", + "iopub.status.idle": "2022-06-25T16:40:28.831883Z", + "shell.execute_reply": "2022-06-25T16:40:28.831066Z", + "shell.execute_reply.started": "2022-06-25T16:40:28.811646Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(206,)\n", + "[False]\n", + "[False]\n" + ] + } + ], + "source": [ + "flag_html = df[\"flag_html\"]\n", + "print(flag_html.shape)\n", + "print(flag_html.isnull().unique())\n", + "print(flag_html.isna().unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", + "metadata": { + "execution": { + "iopub.execute_input": 
"2022-06-25T16:40:24.737030Z", + "iopub.status.busy": "2022-06-25T16:40:24.735571Z", + "iopub.status.idle": "2022-06-25T16:40:24.746143Z", + "shell.execute_reply": "2022-06-25T16:40:24.745147Z", + "shell.execute_reply.started": "2022-06-25T16:40:24.737001Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n", + "Index: []" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"flag_html\"].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:40:22.482698Z", + "iopub.status.busy": "2022-06-25T16:40:22.482421Z", + "iopub.status.idle": "2022-06-25T16:40:22.535788Z", + "shell.execute_reply": "2022-06-25T16:40:22.534754Z", + "shell.execute_reply.started": "2022-06-25T16:40:22.482676Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n", + "Index: []" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"flag_html\"].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "227b0c76-9e45-4849-849e-36355976cba9", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:41:54.100120Z", + "iopub.status.busy": "2022-06-25T16:41:54.099765Z", + "iopub.status.idle": "2022-06-25T16:41:54.105557Z", + "shell.execute_reply": "2022-06-25T16:41:54.104656Z", + "shell.execute_reply.started": "2022-06-25T16:41:54.100092Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Capital
and largest city
Maputo
25°57′S 32°35′E\\ufeff / \\ufeff25.950°S 32.583°E\\ufeff / -25.950; 32.583'" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[102][\"country_html\"][4]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:42:32.237443Z", + "iopub.status.busy": "2022-06-25T16:42:32.237093Z", + "iopub.status.idle": "2022-06-25T16:42:32.246099Z", + "shell.execute_reply": "2022-06-25T16:42:32.245261Z", + "shell.execute_reply.started": "2022-06-25T16:42:32.237414Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "206\n" + ] + }, + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(len([file for file in df.files if len(file) != 0]))\n", + "[file for file in df.files if len(file) == 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d7e60156-1ee5-4bf9-ab9a-d529ee988301", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:45:32.482376Z", + "iopub.status.busy": "2022-06-25T16:45:32.480141Z", + "iopub.status.idle": "2022-06-25T16:45:32.514075Z", + "shell.execute_reply": "2022-06-25T16:45:32.513181Z", + "shell.execute_reply.started": "2022-06-25T16:45:32.482286Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
9https://en.wikipedia.org/wiki/Central_African_...https://en.wikipedia.org/wiki/Flag_of_the_Cent...Central_African_Republic[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b>flag of the <a href=\"/wiki/Central_A...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
58https://en.wikipedia.org/wiki/South_Africahttps://en.wikipedia.org/wiki/Flag_of_South_Af...South_Africa[<tr><th colspan=\"2\" class=\"infobox-above adr\"...<p>The <b>flag of South Africa</b> was designe...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "
" + ], + "text/plain": [ + " country_url \\\n", + "9 https://en.wikipedia.org/wiki/Central_African_... \n", + "58 https://en.wikipedia.org/wiki/South_Africa \n", + "\n", + " flag_description_url \\\n", + "9 https://en.wikipedia.org/wiki/Flag_of_the_Cent... \n", + "58 https://en.wikipedia.org/wiki/Flag_of_South_Af... \n", + "\n", + " short_country_name \\\n", + "9 Central_African_Republic \n", + "58 South_Africa \n", + "\n", + " country_html \\\n", + "9 [The flag of the The flag of South Africa was designe... \n", + "\n", + " file_urls \\\n", + "9 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "58 [https:////upload.wikimedia.org/wikipedia/comm... \n", + "\n", + " files \n", + "9 [{'url': 'https://upload.wikimedia.org/wikiped... \n", + "58 [{'url': 'https://upload.wikimedia.org/wikiped... " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"short_country_name\"].map(lambda country: \"Africa\" in country)]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "97c1e41f-30f3-4116-aa11-5797e05b95ba", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T16:48:57.030064Z", + "iopub.status.busy": "2022-06-25T16:48:57.029671Z", + "iopub.status.idle": "2022-06-25T16:48:57.035861Z", + "shell.execute_reply": "2022-06-25T16:48:57.035069Z", + "shell.execute_reply.started": "2022-06-25T16:48:57.030033Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Capital
'" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[58][\"country_html\"][15]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aff1e06-d054-40db-8203-7343ab914de9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "toc-autonumbering": true, + "toc-showcode": false + }, + "nbformat": 4, + "nbformat_minor": 5 +}