chore: update playground
This commit is contained in:
@@ -1,122 +0,0 @@
|
|||||||
# from __future__ import annotations
|
|
||||||
|
|
||||||
# import re
|
|
||||||
|
|
||||||
# import scrapy
|
|
||||||
# from scrapy.http import TextResponse
|
|
||||||
|
|
||||||
# from wikipedia_country_scraper.items import WikipediaCountryScraperItem
|
|
||||||
|
|
||||||
|
|
||||||
# class CountrydownloaderSpider(scrapy.Spider):
|
|
||||||
# name = "CountrydownloaderSpider"
|
|
||||||
|
|
||||||
# def start_requests(self):
|
|
||||||
# return [
|
|
||||||
# scrapy.Request(
|
|
||||||
# url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls
|
|
||||||
# )
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# def extract_country_urls(self, response: TextResponse):
|
|
||||||
# country_urls_xpath = response.xpath(
|
|
||||||
# "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
|
|
||||||
# ).getall()
|
|
||||||
|
|
||||||
# for url in country_urls_xpath:
|
|
||||||
# # for url in country_urls_xpath[:3]:
|
|
||||||
# regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
|
|
||||||
# yield scrapy.Request(
|
|
||||||
# url=f"https://en.wikipedia.org{url}",
|
|
||||||
# callback=self.extract_country_information,
|
|
||||||
# cb_kwargs={
|
|
||||||
# "country_item": {
|
|
||||||
# "country_url": f"https://en.wikipedia.org{url}",
|
|
||||||
# "short_country_name": regex_match["short_country_name"]
|
|
||||||
# if isinstance(regex_match, re.Match)
|
|
||||||
# else None,
|
|
||||||
# }
|
|
||||||
# },
|
|
||||||
# )
|
|
||||||
|
|
||||||
# def extract_country_information(self, response: TextResponse, country_item: dict):
|
|
||||||
# country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()
|
|
||||||
|
|
||||||
# flag_image_url = response.xpath(
|
|
||||||
# "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
|
|
||||||
# ).get()
|
|
||||||
# flag_description_url = response.xpath(
|
|
||||||
# "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
|
|
||||||
# ).get()
|
|
||||||
|
|
||||||
# anthem_page_url = response.xpath(
|
|
||||||
# "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
|
|
||||||
# ).get()
|
|
||||||
|
|
||||||
# country_item = {
|
|
||||||
# **country_item,
|
|
||||||
# "country": country_information_xpath,
|
|
||||||
# }
|
|
||||||
|
|
||||||
# yield scrapy.Request(
|
|
||||||
# url=f"https://en.wikipedia.org{flag_description_url}",
|
|
||||||
# callback=self.extract_flag_description,
|
|
||||||
# cb_kwargs={
|
|
||||||
# "country_item": country_item,
|
|
||||||
# "urls": {
|
|
||||||
# "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
|
|
||||||
# "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
|
|
||||||
# },
|
|
||||||
# },
|
|
||||||
# )
|
|
||||||
|
|
||||||
# def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
|
|
||||||
# flag_description_xpath = response.xpath(
|
|
||||||
# "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
|
|
||||||
# ).get()
|
|
||||||
# country_item = {**country_item, "flag_description": flag_description_xpath}
|
|
||||||
|
|
||||||
# yield scrapy.Request(
|
|
||||||
# url=urls["flag_image_url"],
|
|
||||||
# callback=self.extract_flag_images,
|
|
||||||
# cb_kwargs={
|
|
||||||
# "country_item": country_item,
|
|
||||||
# "urls": urls,
|
|
||||||
# },
|
|
||||||
# )
|
|
||||||
|
|
||||||
# def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
|
|
||||||
# flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
|
|
||||||
# country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
|
|
||||||
|
|
||||||
# yield scrapy.Request(
|
|
||||||
# url=urls["anthem_page_url"],
|
|
||||||
# callback=self.extract_anthem_file,
|
|
||||||
# cb_kwargs={
|
|
||||||
# "country_item": country_item,
|
|
||||||
# "urls": urls,
|
|
||||||
# },
|
|
||||||
# )
|
|
||||||
|
|
||||||
# def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
|
|
||||||
# anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
|
|
||||||
# _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
|
|
||||||
|
|
||||||
# anthem_file_url = next(
|
|
||||||
# (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
|
|
||||||
# )
|
|
||||||
|
|
||||||
# country_scrapy_item = WikipediaCountryScraperItem()
|
|
||||||
# country_scrapy_item["country_url"] = country_item["country_url"]
|
|
||||||
# country_scrapy_item["short_country_name"] = country_item["short_country_name"]
|
|
||||||
# country_scrapy_item["country"] = country_item["country"]
|
|
||||||
# country_scrapy_item["flag_description"] = country_item["flag_description"]
|
|
||||||
# country_scrapy_item["anthem"] = anthem_text
|
|
||||||
# country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
|
|
||||||
# country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
|
|
||||||
# country_scrapy_item["file_urls"] = [
|
|
||||||
# country_item["flag_image_url"],
|
|
||||||
# f"https://en.wikipedia.org{anthem_file_url}",
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# yield country_scrapy_item
|
|
||||||
12
docs/deck_readme.md
Normal file
12
docs/deck_readme.md
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
|
||||||
|
Although there are many really good geography-focused decks already, some are out of date or contain far more information than is needed to learn the topic at hand.
|
||||||
|
|
||||||
|
For a deck to learn capital cities/flags, having extra information like the population, language, etc. is nice to have, but I personally find it distracting to have this information on a card. When learning capital cities I just want the country and capital - nothing more.
|
||||||
|
|
||||||
|
The idea behind these decks:
|
||||||
|
|
||||||
|
- Should be up to date and current.
|
||||||
|
- Should use an authoritative source (e.g. Wikipedia/Google Maps).
|
||||||
|
- Should be easy to update.
|
||||||
|
- Should be automatically generated - no user tweaking required to get the core deck.
|
||||||
|
- Should be open source - the code is freely available with detailed instructions at <https://github.com/dtomlinson91/geography-anki>.
|
||||||
File diff suppressed because one or more lines are too long
2149
playground/downloaded_data_inspection_lab/capital_xpath.ipynb
Normal file
2149
playground/downloaded_data_inspection_lab/capital_xpath.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
599
playground/downloaded_data_inspection_lab/exploration.ipynb
Normal file
599
playground/downloaded_data_inspection_lab/exploration.ipynb
Normal file
@@ -0,0 +1,599 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "d8185790-0793-4881-99e8-6730f95a8006",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-24T22:04:54.386982Z",
|
||||||
|
"iopub.status.busy": "2022-06-24T22:04:54.386313Z",
|
||||||
|
"iopub.status.idle": "2022-06-24T22:04:54.854521Z",
|
||||||
|
"shell.execute_reply": "2022-06-24T22:04:54.853581Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-24T22:04:54.386910Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import pathlib\n",
|
||||||
|
"\n",
|
||||||
|
"import pandas as pd"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-24T22:04:55.458615Z",
|
||||||
|
"iopub.status.busy": "2022-06-24T22:04:55.457695Z",
|
||||||
|
"iopub.status.idle": "2022-06-24T22:04:55.475878Z",
|
||||||
|
"shell.execute_reply": "2022-06-24T22:04:55.474706Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-24T22:04:55.458548Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[None]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pd_options = {\n",
|
||||||
|
" \"display.max_rows\": None,\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"[pd.set_option(option, value) for option, value in pd_options.items()]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "36149580-91d9-431d-99c3-51feee829e79",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-24T22:04:56.134416Z",
|
||||||
|
"iopub.status.busy": "2022-06-24T22:04:56.133745Z",
|
||||||
|
"iopub.status.idle": "2022-06-24T22:04:56.140326Z",
|
||||||
|
"shell.execute_reply": "2022-06-24T22:04:56.138507Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-24T22:04:56.134371Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data_directory = (\n",
|
||||||
|
" pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "d03be94e-8642-4916-8a43-1711e0c21b36",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-24T22:04:56.621163Z",
|
||||||
|
"iopub.status.busy": "2022-06-24T22:04:56.620692Z",
|
||||||
|
"iopub.status.idle": "2022-06-24T22:04:56.731001Z",
|
||||||
|
"shell.execute_reply": "2022-06-24T22:04:56.728392Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-24T22:04:56.621128Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"countries_file = data_directory / \"countries.json\"\n",
|
||||||
|
"countries = json.loads(countries_file.read_text())\n",
|
||||||
|
"# countries"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:40:34.482040Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:40:34.481539Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:40:34.624178Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:40:34.618757Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:40:34.482012Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Index(['country_url', 'flag_description_url', 'short_country_name',\n",
|
||||||
|
" 'country_html', 'flag_html', 'file_urls', 'files'],\n",
|
||||||
|
" dtype='object')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df = pd.read_json(countries_file)\n",
|
||||||
|
"df.columns"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:40:32.210227Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:40:32.209873Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:40:32.223710Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:40:32.222746Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:40:32.210199Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(206,)\n",
|
||||||
|
"[False]\n",
|
||||||
|
"[False]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"country_url = df[\"country_url\"]\n",
|
||||||
|
"print(country_url.shape)\n",
|
||||||
|
"print(country_url.isnull().unique())\n",
|
||||||
|
"print(country_url.isna().unique())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "48db8f93-659b-45a4-8477-a7cec139bebc",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-24T22:04:59.710467Z",
|
||||||
|
"iopub.status.busy": "2022-06-24T22:04:59.709874Z",
|
||||||
|
"iopub.status.idle": "2022-06-24T22:04:59.720517Z",
|
||||||
|
"shell.execute_reply": "2022-06-24T22:04:59.717623Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-24T22:04:59.710431Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(206,)\n",
|
||||||
|
"[False]\n",
|
||||||
|
"[False]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"short_country_name = df[\"short_country_name\"]\n",
|
||||||
|
"print(short_country_name.shape)\n",
|
||||||
|
"print(short_country_name.isnull().unique())\n",
|
||||||
|
"print(short_country_name.isna().unique())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:40:28.811675Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:40:28.810872Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:40:28.831883Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:40:28.831066Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:40:28.811646Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(206,)\n",
|
||||||
|
"[False]\n",
|
||||||
|
"[False]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"flag_html = df[\"flag_html\"]\n",
|
||||||
|
"print(flag_html.shape)\n",
|
||||||
|
"print(flag_html.isnull().unique())\n",
|
||||||
|
"print(flag_html.isna().unique())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:40:24.737030Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:40:24.735571Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:40:24.746143Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:40:24.745147Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:40:24.737001Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>country_url</th>\n",
|
||||||
|
" <th>flag_description_url</th>\n",
|
||||||
|
" <th>short_country_name</th>\n",
|
||||||
|
" <th>country_html</th>\n",
|
||||||
|
" <th>flag_html</th>\n",
|
||||||
|
" <th>file_urls</th>\n",
|
||||||
|
" <th>files</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"Empty DataFrame\n",
|
||||||
|
"Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n",
|
||||||
|
"Index: []"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df[df[\"flag_html\"].isnull()]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:40:22.482698Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:40:22.482421Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:40:22.535788Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:40:22.534754Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:40:22.482676Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>country_url</th>\n",
|
||||||
|
" <th>flag_description_url</th>\n",
|
||||||
|
" <th>short_country_name</th>\n",
|
||||||
|
" <th>country_html</th>\n",
|
||||||
|
" <th>flag_html</th>\n",
|
||||||
|
" <th>file_urls</th>\n",
|
||||||
|
" <th>files</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"Empty DataFrame\n",
|
||||||
|
"Columns: [country_url, flag_description_url, short_country_name, country_html, flag_html, file_urls, files]\n",
|
||||||
|
"Index: []"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df[df[\"flag_html\"].isna()]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "227b0c76-9e45-4849-849e-36355976cba9",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:41:54.100120Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:41:54.099765Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:41:54.105557Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:41:54.104656Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:41:54.100092Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'<tr><th scope=\"row\" class=\"infobox-label\">Capital<div class=\"ib-country-largest\">and largest city</div></th><td class=\"infobox-data\"><a href=\"/wiki/Maputo\" title=\"Maputo\">Maputo</a><br><link rel=\"mw-deduplicated-inline-style\" href=\"mw-data:TemplateStyles:r1073938472\"><span class=\"plainlinks nourlexpansion\"><a class=\"external text\" href=\"//geohack.toolforge.org/geohack.php?pagename=Mozambique&params=25_57_S_32_35_E_type:city_region:MZ\"><span class=\"geo-default\"><span class=\"geo-dms\" title=\"Maps, aerial photos, and other data for this location\"><span class=\"latitude\">25°57′S</span> <span class=\"longitude\">32°35′E</span></span></span><span class=\"geo-multi-punct\">\\ufeff / \\ufeff</span><span class=\"geo-nondefault\"><span class=\"geo-dec\" title=\"Maps, aerial photos, and other data for this location\">25.950°S 32.583°E</span><span style=\"display:none\">\\ufeff / <span class=\"geo\">-25.950; 32.583</span></span></span></a></span></td></tr>'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.iloc[102][\"country_html\"][4]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 34,
|
||||||
|
"id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:42:32.237443Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:42:32.237093Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:42:32.246099Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:42:32.245261Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:42:32.237414Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"206\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 34,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(len([file for file in df.files if len(file) != 0]))\n",
|
||||||
|
"[file for file in df.files if len(file) == 0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"id": "d7e60156-1ee5-4bf9-ab9a-d529ee988301",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:45:32.482376Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:45:32.480141Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:45:32.514075Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:45:32.513181Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:45:32.482286Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>country_url</th>\n",
|
||||||
|
" <th>flag_description_url</th>\n",
|
||||||
|
" <th>short_country_name</th>\n",
|
||||||
|
" <th>country_html</th>\n",
|
||||||
|
" <th>flag_html</th>\n",
|
||||||
|
" <th>file_urls</th>\n",
|
||||||
|
" <th>files</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>https://en.wikipedia.org/wiki/Central_African_...</td>\n",
|
||||||
|
" <td>https://en.wikipedia.org/wiki/Flag_of_the_Cent...</td>\n",
|
||||||
|
" <td>Central_African_Republic</td>\n",
|
||||||
|
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||||||
|
" <td><p>The <b>flag of the <a href=\"/wiki/Central_A...</td>\n",
|
||||||
|
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||||||
|
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>58</th>\n",
|
||||||
|
" <td>https://en.wikipedia.org/wiki/South_Africa</td>\n",
|
||||||
|
" <td>https://en.wikipedia.org/wiki/Flag_of_South_Af...</td>\n",
|
||||||
|
" <td>South_Africa</td>\n",
|
||||||
|
" <td>[<tr><th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
|
||||||
|
" <td><p>The <b>flag of South Africa</b> was designe...</td>\n",
|
||||||
|
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||||||
|
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" country_url \\\n",
|
||||||
|
"9 https://en.wikipedia.org/wiki/Central_African_... \n",
|
||||||
|
"58 https://en.wikipedia.org/wiki/South_Africa \n",
|
||||||
|
"\n",
|
||||||
|
" flag_description_url \\\n",
|
||||||
|
"9 https://en.wikipedia.org/wiki/Flag_of_the_Cent... \n",
|
||||||
|
"58 https://en.wikipedia.org/wiki/Flag_of_South_Af... \n",
|
||||||
|
"\n",
|
||||||
|
" short_country_name \\\n",
|
||||||
|
"9 Central_African_Republic \n",
|
||||||
|
"58 South_Africa \n",
|
||||||
|
"\n",
|
||||||
|
" country_html \\\n",
|
||||||
|
"9 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||||||
|
"58 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... \n",
|
||||||
|
"\n",
|
||||||
|
" flag_html \\\n",
|
||||||
|
"9 <p>The <b>flag of the <a href=\"/wiki/Central_A... \n",
|
||||||
|
"58 <p>The <b>flag of South Africa</b> was designe... \n",
|
||||||
|
"\n",
|
||||||
|
" file_urls \\\n",
|
||||||
|
"9 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||||||
|
"58 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||||||
|
"\n",
|
||||||
|
" files \n",
|
||||||
|
"9 [{'url': 'https://upload.wikimedia.org/wikiped... \n",
|
||||||
|
"58 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df[df[\"short_country_name\"].map(lambda country: \"Africa\" in country)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 51,
|
||||||
|
"id": "97c1e41f-30f3-4116-aa11-5797e05b95ba",
|
||||||
|
"metadata": {
|
||||||
|
"execution": {
|
||||||
|
"iopub.execute_input": "2022-06-25T16:48:57.030064Z",
|
||||||
|
"iopub.status.busy": "2022-06-25T16:48:57.029671Z",
|
||||||
|
"iopub.status.idle": "2022-06-25T16:48:57.035861Z",
|
||||||
|
"shell.execute_reply": "2022-06-25T16:48:57.035069Z",
|
||||||
|
"shell.execute_reply.started": "2022-06-25T16:48:57.030033Z"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'<tr><th scope=\"row\" class=\"infobox-label\">Capital</th><td class=\"infobox-data\"><div class=\"plainlist\"><ul><li><a href=\"/wiki/Pretoria\" title=\"Pretoria\">Pretoria</a> (executive)<sup id=\"cite_ref-South_Africa_at_a_glance_2-0\" class=\"reference\"><a href=\"#cite_note-South_Africa_at_a_glance-2\">[2]</a></sup></li><li><a href=\"/wiki/Cape_Town\" title=\"Cape Town\">Cape Town</a> (legislative)<sup id=\"cite_ref-South_Africa_at_a_glance_2-1\" class=\"reference\"><a href=\"#cite_note-South_Africa_at_a_glance-2\">[2]</a></sup></li><li><a href=\"/wiki/Bloemfontein\" title=\"Bloemfontein\">Bloemfontein</a> (judicial)<sup id=\"cite_ref-South_Africa_at_a_glance_2-2\" class=\"reference\"><a href=\"#cite_note-South_Africa_at_a_glance-2\">[2]</a></sup></li></ul></div></td></tr>'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 51,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.iloc[58][\"country_html\"][15]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2aff1e06-d054-40db-8203-7343ab914de9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.12"
|
||||||
|
},
|
||||||
|
"toc-autonumbering": true,
|
||||||
|
"toc-showcode": false
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user