chore: add capitals spider
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
from wikipedia_country_scraper.items import WikipediaCountryScraperItem
|
||||
|
||||
|
||||
class WikipediaCountryScraperItem(scrapy.Item):
|
||||
@@ -24,3 +25,8 @@ class AnthemsItem(scrapy.Item):
|
||||
|
||||
file_urls = scrapy.Field()
|
||||
files = scrapy.Field()
|
||||
|
||||
|
||||
class CapitalsItem(scrapy.Item):
    """Item holding one country together with its capital(s)."""

    # Country name as scraped from the list page.
    country_name = scrapy.Field()
    # Capital name(s) scraped for that country.
    capitals = scrapy.Field()
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import pathlib

import scrapy
from scrapy.http import TextResponse

from wikipedia_country_scraper.items import CapitalsItem
|
||||
|
||||
|
||||
class CapitalsSpider(scrapy.Spider):
    """Scrape country capitals from Wikipedia's native-language capitals list.

    One ``CapitalsItem`` is yielded per table row that names a country. The
    feed export configured in ``custom_settings`` serialises the items to
    ``data/scrapy/raw_country_data/capitals.json``.
    """

    name = "capitals"
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
    ]
    custom_settings = {
        "FEEDS": {
            # The output path is anchored to this file's location rather than
            # the working directory.
            # NOTE(review): parents[4] is presumably the repository root - confirm.
            pathlib.Path(__file__).resolve().parents[4]
            / "data"
            / "scrapy"
            / "raw_country_data"
            / "capitals.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
        },
    }

    def parse(self, response: TextResponse):
        """Yield a ``CapitalsItem`` for every country row of the wikitable(s).

        Args:
            response: The downloaded list page.

        Yields:
            CapitalsItem: ``country_name`` is the text of the first titled
            link inside the row's first cell; ``capitals`` is the list of
            texts of titled links that are direct children of the row's
            second cell (possibly empty).
        """
        # Rows carrying the grey background style are excluded
        # (presumably section-heading rows - TODO confirm against the page).
        rows = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]")

        for row in rows:
            # Read both cells from the SAME row. Collecting all country links
            # into one flat list and zipping it with per-row capitals lets a
            # row with zero or several links in its first cell shift every
            # later pairing.
            country = row.xpath("td[1]//a[@title]/text()").get()
            if country is None:
                # Header rows (no <td>) or rows without a linked country name.
                continue

            capital_item = CapitalsItem()
            capital_item["country_name"] = country
            capital_item["capitals"] = row.xpath("td[2]/a[@title]/text()").getall()
            yield capital_item
|
||||
@@ -1561,7 +1561,9 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"countries[countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())]"
|
||||
"countries[\n",
|
||||
" countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1661,7 +1663,10 @@
|
||||
" for match in branch_filter:\n",
|
||||
" print(etree.tostring(match))\n",
|
||||
" print(\"match\", match.text)\n",
|
||||
" if isinstance(re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n",
|
||||
" if isinstance(\n",
|
||||
" re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")),\n",
|
||||
" re.Match,\n",
|
||||
" ):\n",
|
||||
" _capitals.append(match.text)\n",
|
||||
" result = {\n",
|
||||
" \"index\": 20,\n",
|
||||
@@ -1702,6 +1707,169 @@
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7dbe415a-ba82-4813-9723-ac66ec9b29aa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### State of Palestine"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 212,
|
||||
"id": "b39a6451-19e0-4ec6-a925-89bcdb89c441",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T23:47:35.380713Z",
|
||||
"iopub.status.busy": "2022-06-25T23:47:35.380097Z",
|
||||
"iopub.status.idle": "2022-06-25T23:47:35.485628Z",
|
||||
"shell.execute_reply": "2022-06-25T23:47:35.484621Z",
|
||||
"shell.execute_reply.started": "2022-06-25T23:47:35.380654Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>country_url</th>\n",
|
||||
" <th>flag_description_url</th>\n",
|
||||
" <th>short_country_name</th>\n",
|
||||
" <th>country_html</th>\n",
|
||||
" <th>flag_html</th>\n",
|
||||
" <th>file_urls</th>\n",
|
||||
" <th>files</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>87</th>\n",
|
||||
" <td>https://en.wikipedia.org/wiki/State_of_Palestine</td>\n",
|
||||
" <td>https://en.wikipedia.org/wiki/Flag_of_Palestine</td>\n",
|
||||
" <td>State of Palestine</td>\n",
|
||||
" <td><div><tr><th colspan=\"2\" class=\"infobox-above ...</td>\n",
|
||||
" <td><p>The <b>flag of Palestine</b> (<a href=\"/wik...</td>\n",
|
||||
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
|
||||
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" country_url \\\n",
|
||||
"87 https://en.wikipedia.org/wiki/State_of_Palestine \n",
|
||||
"\n",
|
||||
" flag_description_url short_country_name \\\n",
|
||||
"87 https://en.wikipedia.org/wiki/Flag_of_Palestine State of Palestine \n",
|
||||
"\n",
|
||||
" country_html \\\n",
|
||||
"87 <div><tr><th colspan=\"2\" class=\"infobox-above ... \n",
|
||||
"\n",
|
||||
" flag_html \\\n",
|
||||
"87 <p>The <b>flag of Palestine</b> (<a href=\"/wik... \n",
|
||||
"\n",
|
||||
" file_urls \\\n",
|
||||
"87 [https:////upload.wikimedia.org/wikipedia/comm... \n",
|
||||
"\n",
|
||||
" files \n",
|
||||
"87 [{'url': 'https://upload.wikimedia.org/wikiped... "
|
||||
]
|
||||
},
|
||||
"execution_count": 212,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"countries[\n",
|
||||
" countries[\"short_country_name\"].map(lambda country: \"palestine\" in country.lower())\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 213,
|
||||
"id": "ab895aca-9ffe-4dcf-beb1-4ab9d314c82b",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T23:47:39.784680Z",
|
||||
"iopub.status.busy": "2022-06-25T23:47:39.784136Z",
|
||||
"iopub.status.idle": "2022-06-25T23:47:39.791893Z",
|
||||
"shell.execute_reply": "2022-06-25T23:47:39.790751Z",
|
||||
"shell.execute_reply.started": "2022-06-25T23:47:39.784625Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"root = etree.fromstring(countries[\"country_html\"].iloc[87], parser)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 262,
|
||||
"id": "e78c8eb9-a7d3-4617-b4fa-0ef1afd7bf29",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-26T00:00:41.346966Z",
|
||||
"iopub.status.busy": "2022-06-26T00:00:41.346256Z",
|
||||
"iopub.status.idle": "2022-06-26T00:00:41.353815Z",
|
||||
"shell.execute_reply": "2022-06-26T00:00:41.352825Z",
|
||||
"shell.execute_reply.started": "2022-06-26T00:00:41.346905Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Jerusalem\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for element in root.xpath(\n",
|
||||
" \"//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]\"\n",
|
||||
"):\n",
|
||||
" print(element.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "df8d24f4-0a07-46fc-920d-21cb0d6472e6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "08e1bea0-6f04-4c61-98fb-90abd4750a94",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9d909d88-e086-41fe-a46f-1cd0a21ec625",
|
||||
@@ -1712,15 +1880,15 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 120,
|
||||
"execution_count": 271,
|
||||
"id": "9263180c-92ed-4f5f-a479-70af90599b75",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T21:51:58.514700Z",
|
||||
"iopub.status.busy": "2022-06-25T21:51:58.514295Z",
|
||||
"iopub.status.idle": "2022-06-25T21:51:58.524855Z",
|
||||
"shell.execute_reply": "2022-06-25T21:51:58.524175Z",
|
||||
"shell.execute_reply.started": "2022-06-25T21:51:58.514671Z"
|
||||
"iopub.execute_input": "2022-06-26T00:05:59.447161Z",
|
||||
"iopub.status.busy": "2022-06-26T00:05:59.446696Z",
|
||||
"iopub.status.idle": "2022-06-26T00:05:59.469995Z",
|
||||
"shell.execute_reply": "2022-06-26T00:05:59.468232Z",
|
||||
"shell.execute_reply.started": "2022-06-26T00:05:59.447116Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
@@ -1729,34 +1897,31 @@
|
||||
"def extract_capital_0(index: int, country_name: str, country_html: str):\n",
|
||||
" result = None\n",
|
||||
" root = etree.fromstring(country_html, parser)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" # matches single capital\n",
|
||||
" for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td/a\"):\n",
|
||||
" result = {\"index\": index, \"country_name\": country_name, \"capital\": element.text}\n",
|
||||
"\n",
|
||||
" # matches multiple capitals with numbered footnote\n",
|
||||
" try:\n",
|
||||
" # matches multiple capitals\n",
|
||||
" if result is None:\n",
|
||||
" for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||||
" for capital in element:\n",
|
||||
" capital_filter = capital.xpath(\"ul//a\")\n",
|
||||
" branch_filter = capital.xpath(\"ul//li\")\n",
|
||||
"\n",
|
||||
" capital_filter = element.xpath(\"div//a\")\n",
|
||||
" branch_filter = element.xpath(\"div//ul/li/text()\")\n",
|
||||
" _capitals = []\n",
|
||||
" for item in capital_filter:\n",
|
||||
" if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n",
|
||||
" _capitals.append(item.text)\n",
|
||||
" for match in branch_filter:\n",
|
||||
" branch = match.xpath(\"text()\")\n",
|
||||
" _capitals.append(re.search(r\"(?:\\()([^\\)]*)\", branch[0])[1])\n",
|
||||
" for root in capital_filter:\n",
|
||||
" if root.text is not None:\n",
|
||||
" _capitals.append(root.text)\n",
|
||||
" for root in branch_filter:\n",
|
||||
" if isinstance(\n",
|
||||
" res := re.search(r\"(?:\\()([^/)]*)\", str(root).strip()), re.Match\n",
|
||||
" ):\n",
|
||||
" _capitals.append(res[1])\n",
|
||||
" result = {\n",
|
||||
" \"index\": index,\n",
|
||||
" \"country_name\": country_name,\n",
|
||||
" \"capital\": _capitals,\n",
|
||||
" }\n",
|
||||
" except IndexError:\n",
|
||||
" result = None\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" # matches mutiple capitals with italic footnote\n",
|
||||
" if result is None:\n",
|
||||
" for element in root.xpath(\"//th[text() = 'Capital']/following-sibling::td\"):\n",
|
||||
@@ -1769,28 +1934,44 @@
|
||||
" if not isinstance(re.search(r\"\\d\", item.text), re.Match):\n",
|
||||
" _capitals.append(item.text)\n",
|
||||
" for match in branch_filter:\n",
|
||||
" if isinstance(re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n",
|
||||
" if isinstance(\n",
|
||||
" re.search(\n",
|
||||
" r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")\n",
|
||||
" ),\n",
|
||||
" re.Match,\n",
|
||||
" ):\n",
|
||||
" _capitals.append(match.text)\n",
|
||||
" result = {\n",
|
||||
" \"index\": 20,\n",
|
||||
" \"country_name\": countries[\"short_country_name\"].iloc[20],\n",
|
||||
" \"index\": index,\n",
|
||||
" \"country_name\": country_name,\n",
|
||||
" \"capital\": _capitals,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # proclaimed capitals: e.g Palestine\n",
|
||||
" if result is None:\n",
|
||||
" for element in root.xpath(\n",
|
||||
" \"//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]\"\n",
|
||||
" ):\n",
|
||||
" result = {\n",
|
||||
" \"index\": index,\n",
|
||||
" \"country_name\": country_name,\n",
|
||||
" \"capital\": element.text,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" return result or None"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 128,
|
||||
"execution_count": 273,
|
||||
"id": "0b39d1c7-a1ee-4031-a0eb-93ab5ef525ff",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T22:10:22.757906Z",
|
||||
"iopub.status.busy": "2022-06-25T22:10:22.757413Z",
|
||||
"iopub.status.idle": "2022-06-25T22:10:22.881155Z",
|
||||
"shell.execute_reply": "2022-06-25T22:10:22.880194Z",
|
||||
"shell.execute_reply.started": "2022-06-25T22:10:22.757877Z"
|
||||
"iopub.execute_input": "2022-06-26T00:06:10.855929Z",
|
||||
"iopub.status.busy": "2022-06-26T00:06:10.855004Z",
|
||||
"iopub.status.idle": "2022-06-26T00:06:11.012760Z",
|
||||
"shell.execute_reply": "2022-06-26T00:06:11.011838Z",
|
||||
"shell.execute_reply.started": "2022-06-26T00:06:10.855859Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
@@ -1799,212 +1980,10 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'index': 0, 'country_name': 'Afghanistan', 'capital': None}\n",
|
||||
"{'index': 1, 'country_name': 'Croatia', 'capital': None}\n",
|
||||
"{'index': 2, 'country_name': 'Costa Rica', 'capital': None}\n",
|
||||
"{'index': 3, 'country_name': 'Democratic Republic of the Congo', 'capital': None}\n",
|
||||
"{'index': 4, 'country_name': 'Comoros', 'capital': None}\n",
|
||||
"{'index': 5, 'country_name': 'Republic of the Congo', 'capital': None}\n",
|
||||
"{'index': 6, 'country_name': 'China', 'capital': None}\n",
|
||||
"{'index': 7, 'country_name': 'Chile', 'capital': None}\n",
|
||||
"{'index': 8, 'country_name': 'Chad', 'capital': None}\n",
|
||||
"{'index': 9, 'country_name': 'Central African Republic', 'capital': None}\n",
|
||||
"{'index': 10, 'country_name': 'Cape Verde', 'capital': None}\n",
|
||||
"{'index': 11, 'country_name': 'Colombia', 'capital': None}\n",
|
||||
"{'index': 12, 'country_name': 'Cameroon', 'capital': None}\n",
|
||||
"{'index': 13, 'country_name': 'Cambodia', 'capital': None}\n",
|
||||
"{'index': 14, 'country_name': 'Burundi', 'capital': None}\n",
|
||||
"{'index': 15, 'country_name': 'Transnistria', 'capital': None}\n",
|
||||
"{'index': 16, 'country_name': 'Canada', 'capital': None}\n",
|
||||
"{'index': 17, 'country_name': 'Taiwan', 'capital': None}\n",
|
||||
"{'index': 18, 'country_name': 'South Ossetia', 'capital': None}\n",
|
||||
"{'index': 19, 'country_name': 'Somaliland', 'capital': None}\n",
|
||||
"{'index': 20, 'country_name': 'Sahrawi Arab Democratic Republic', 'capital': None}\n",
|
||||
"{'index': 21, 'country_name': 'Northern Cyprus', 'capital': None}\n",
|
||||
"{'index': 22, 'country_name': 'Niue', 'capital': None}\n",
|
||||
"{'index': 23, 'country_name': 'Kosovo', 'capital': None}\n",
|
||||
"{'index': 24, 'country_name': 'Cook Islands', 'capital': None}\n",
|
||||
"{'index': 25, 'country_name': 'Republic of Artsakh', 'capital': None}\n",
|
||||
"{'index': 26, 'country_name': 'Abkhazia', 'capital': None}\n",
|
||||
"{'index': 27, 'country_name': 'Zimbabwe', 'capital': None}\n",
|
||||
"{'index': 28, 'country_name': 'Zambia', 'capital': None}\n",
|
||||
"{'index': 29, 'country_name': 'Yemen', 'capital': None}\n",
|
||||
"{'index': 30, 'country_name': 'Vietnam', 'capital': None}\n",
|
||||
"{'index': 31, 'country_name': 'Venezuela', 'capital': None}\n",
|
||||
"{'index': 32, 'country_name': 'Luhansk People%27s Republic', 'capital': None}\n",
|
||||
"None\n",
|
||||
"{'index': 34, 'country_name': 'Vanuatu', 'capital': None}\n",
|
||||
"{'index': 35, 'country_name': 'Uzbekistan', 'capital': None}\n",
|
||||
"{'index': 36, 'country_name': 'Uruguay', 'capital': None}\n",
|
||||
"{'index': 37, 'country_name': 'United Kingdom', 'capital': None}\n",
|
||||
"{'index': 38, 'country_name': 'United States', 'capital': None}\n",
|
||||
"{'index': 39, 'country_name': 'United Arab Emirates', 'capital': None}\n",
|
||||
"{'index': 40, 'country_name': 'Uganda', 'capital': None}\n",
|
||||
"{'index': 41, 'country_name': 'Tuvalu', 'capital': None}\n",
|
||||
"{'index': 42, 'country_name': 'Tunisia', 'capital': None}\n",
|
||||
"{'index': 43, 'country_name': 'Turkmenistan', 'capital': None}\n",
|
||||
"{'index': 44, 'country_name': 'Tonga', 'capital': None}\n",
|
||||
"{'index': 45, 'country_name': 'Trinidad and Tobago', 'capital': None}\n",
|
||||
"{'index': 46, 'country_name': 'Togo', 'capital': None}\n",
|
||||
"{'index': 47, 'country_name': 'Ukraine', 'capital': None}\n",
|
||||
"{'index': 48, 'country_name': 'Thailand', 'capital': None}\n",
|
||||
"{'index': 49, 'country_name': 'Tanzania', 'capital': None}\n",
|
||||
"{'index': 50, 'country_name': 'Tajikistan', 'capital': None}\n",
|
||||
"{'index': 51, 'country_name': 'Syria', 'capital': None}\n",
|
||||
"{'index': 52, 'country_name': 'Switzerland', 'capital': None}\n",
|
||||
"{'index': 53, 'country_name': 'Sweden', 'capital': None}\n",
|
||||
"{'index': 54, 'country_name': 'Sudan', 'capital': None}\n",
|
||||
"{'index': 55, 'country_name': 'Suriname', 'capital': None}\n",
|
||||
"{'index': 56, 'country_name': 'Sri Lanka', 'capital': None}\n",
|
||||
"{'index': 57, 'country_name': 'Spain', 'capital': None}\n",
|
||||
"{'index': 58, 'country_name': 'South Africa', 'capital': None}\n",
|
||||
"{'index': 59, 'country_name': 'Somalia', 'capital': None}\n",
|
||||
"{'index': 60, 'country_name': 'Solomon Islands', 'capital': None}\n",
|
||||
"{'index': 61, 'country_name': 'Slovenia', 'capital': None}\n",
|
||||
"{'index': 62, 'country_name': 'South Sudan', 'capital': None}\n",
|
||||
"{'index': 63, 'country_name': 'Slovakia', 'capital': None}\n",
|
||||
"None\n",
|
||||
"{'index': 65, 'country_name': 'Sierra Leone', 'capital': None}\n",
|
||||
"{'index': 66, 'country_name': 'Seychelles', 'capital': None}\n",
|
||||
"{'index': 67, 'country_name': 'Serbia', 'capital': None}\n",
|
||||
"{'index': 68, 'country_name': 'Senegal', 'capital': None}\n",
|
||||
"{'index': 69, 'country_name': 'Saudi Arabia', 'capital': None}\n",
|
||||
"{'index': 70, 'country_name': 'S%C3%A3o Tom%C3%A9 and Pr%C3%ADncipe', 'capital': None}\n",
|
||||
"{'index': 71, 'country_name': 'San Marino', 'capital': None}\n",
|
||||
"{'index': 72, 'country_name': 'Samoa', 'capital': None}\n",
|
||||
"{'index': 73, 'country_name': 'Saint Vincent and the Grenadines', 'capital': None}\n",
|
||||
"{'index': 74, 'country_name': 'Saint Lucia', 'capital': None}\n",
|
||||
"{'index': 75, 'country_name': 'Rwanda', 'capital': None}\n",
|
||||
"{'index': 76, 'country_name': 'Romania', 'capital': None}\n",
|
||||
"{'index': 77, 'country_name': 'Portugal', 'capital': None}\n",
|
||||
"{'index': 78, 'country_name': 'Poland', 'capital': None}\n",
|
||||
"{'index': 79, 'country_name': 'Saint Kitts and Nevis', 'capital': None}\n",
|
||||
"{'index': 80, 'country_name': 'Philippines', 'capital': None}\n",
|
||||
"{'index': 81, 'country_name': 'Qatar', 'capital': None}\n",
|
||||
"{'index': 82, 'country_name': 'Russia', 'capital': None}\n",
|
||||
"{'index': 83, 'country_name': 'Peru', 'capital': None}\n",
|
||||
"{'index': 84, 'country_name': 'Papua New Guinea', 'capital': None}\n",
|
||||
"{'index': 85, 'country_name': 'Paraguay', 'capital': None}\n",
|
||||
"{'index': 86, 'country_name': 'Panama', 'capital': None}\n",
|
||||
"None\n",
|
||||
"{'index': 88, 'country_name': 'Palau', 'capital': None}\n",
|
||||
"{'index': 89, 'country_name': 'Oman', 'capital': None}\n",
|
||||
"{'index': 90, 'country_name': 'North Macedonia', 'capital': None}\n",
|
||||
"{'index': 91, 'country_name': 'Norway', 'capital': None}\n",
|
||||
"{'index': 92, 'country_name': 'Pakistan', 'capital': None}\n",
|
||||
"{'index': 93, 'country_name': 'Nigeria', 'capital': None}\n",
|
||||
"{'index': 94, 'country_name': 'Niger', 'capital': None}\n",
|
||||
"{'index': 95, 'country_name': 'Nicaragua', 'capital': None}\n",
|
||||
"{'index': 96, 'country_name': 'Kingdom of the Netherlands', 'capital': None}\n",
|
||||
"{'index': 97, 'country_name': 'Nepal', 'capital': None}\n",
|
||||
"{'index': 98, 'country_name': 'Nauru', 'capital': None}\n",
|
||||
"{'index': 99, 'country_name': 'New Zealand', 'capital': None}\n",
|
||||
"{'index': 100, 'country_name': 'Namibia', 'capital': None}\n",
|
||||
"{'index': 101, 'country_name': 'Myanmar', 'capital': None}\n",
|
||||
"{'index': 102, 'country_name': 'Mozambique', 'capital': None}\n",
|
||||
"{'index': 103, 'country_name': 'Morocco', 'capital': None}\n",
|
||||
"{'index': 104, 'country_name': 'Montenegro', 'capital': None}\n",
|
||||
"{'index': 105, 'country_name': 'Monaco', 'capital': None}\n",
|
||||
"{'index': 106, 'country_name': 'Moldova', 'capital': None}\n",
|
||||
"{'index': 107, 'country_name': 'Federated States of Micronesia', 'capital': None}\n",
|
||||
"{'index': 108, 'country_name': 'Mexico', 'capital': None}\n",
|
||||
"{'index': 109, 'country_name': 'Mongolia', 'capital': None}\n",
|
||||
"{'index': 110, 'country_name': 'Mauritius', 'capital': None}\n",
|
||||
"{'index': 111, 'country_name': 'Mauritania', 'capital': None}\n",
|
||||
"{'index': 112, 'country_name': 'Marshall Islands', 'capital': None}\n",
|
||||
"{'index': 113, 'country_name': 'Malta', 'capital': None}\n",
|
||||
"{'index': 114, 'country_name': 'Mali', 'capital': None}\n",
|
||||
"{'index': 115, 'country_name': 'Maldives', 'capital': None}\n",
|
||||
"{'index': 116, 'country_name': 'Malaysia', 'capital': None}\n",
|
||||
"{'index': 117, 'country_name': 'Malawi', 'capital': None}\n",
|
||||
"{'index': 118, 'country_name': 'Madagascar', 'capital': None}\n",
|
||||
"{'index': 119, 'country_name': 'Luxembourg', 'capital': None}\n",
|
||||
"{'index': 120, 'country_name': 'Lithuania', 'capital': None}\n",
|
||||
"{'index': 121, 'country_name': 'Liechtenstein', 'capital': None}\n",
|
||||
"{'index': 122, 'country_name': 'Libya', 'capital': None}\n",
|
||||
"{'index': 123, 'country_name': 'Liberia', 'capital': None}\n",
|
||||
"{'index': 124, 'country_name': 'Lesotho', 'capital': None}\n",
|
||||
"{'index': 125, 'country_name': 'Lebanon', 'capital': None}\n",
|
||||
"{'index': 126, 'country_name': 'Latvia', 'capital': None}\n",
|
||||
"{'index': 127, 'country_name': 'Laos', 'capital': None}\n",
|
||||
"{'index': 128, 'country_name': 'Kyrgyzstan', 'capital': None}\n",
|
||||
"{'index': 129, 'country_name': 'Kuwait', 'capital': None}\n",
|
||||
"{'index': 130, 'country_name': 'South Korea', 'capital': None}\n",
|
||||
"{'index': 131, 'country_name': 'North Korea', 'capital': None}\n",
|
||||
"{'index': 132, 'country_name': 'Kiribati', 'capital': None}\n",
|
||||
"{'index': 133, 'country_name': 'Kenya', 'capital': None}\n",
|
||||
"{'index': 134, 'country_name': 'Kazakhstan', 'capital': None}\n",
|
||||
"{'index': 135, 'country_name': 'Jordan', 'capital': None}\n",
|
||||
"{'index': 136, 'country_name': 'Japan', 'capital': None}\n",
|
||||
"{'index': 137, 'country_name': 'Jamaica', 'capital': None}\n",
|
||||
"{'index': 138, 'country_name': 'Ivory Coast', 'capital': None}\n",
|
||||
"{'index': 139, 'country_name': 'Italy', 'capital': None}\n",
|
||||
"{'index': 140, 'country_name': 'Israel', 'capital': None}\n",
|
||||
"{'index': 141, 'country_name': 'Republic of Ireland', 'capital': None}\n",
|
||||
"{'index': 142, 'country_name': 'Iraq', 'capital': None}\n",
|
||||
"{'index': 143, 'country_name': 'Iran', 'capital': None}\n",
|
||||
"{'index': 144, 'country_name': 'Indonesia', 'capital': None}\n",
|
||||
"{'index': 145, 'country_name': 'India', 'capital': None}\n",
|
||||
"{'index': 146, 'country_name': 'Iceland', 'capital': None}\n",
|
||||
"{'index': 147, 'country_name': 'Hungary', 'capital': None}\n",
|
||||
"{'index': 148, 'country_name': 'Honduras', 'capital': None}\n",
|
||||
"{'index': 149, 'country_name': 'Guyana', 'capital': None}\n",
|
||||
"{'index': 150, 'country_name': 'Haiti', 'capital': None}\n",
|
||||
"{'index': 151, 'country_name': 'Guinea-Bissau', 'capital': None}\n",
|
||||
"{'index': 152, 'country_name': 'Guinea', 'capital': None}\n",
|
||||
"{'index': 153, 'country_name': 'Guatemala', 'capital': None}\n",
|
||||
"{'index': 154, 'country_name': 'Grenada', 'capital': None}\n",
|
||||
"{'index': 155, 'country_name': 'Greece', 'capital': None}\n",
|
||||
"{'index': 156, 'country_name': 'Ghana', 'capital': None}\n",
|
||||
"{'index': 157, 'country_name': 'Germany', 'capital': None}\n",
|
||||
"{'index': 158, 'country_name': 'Georgia (country)', 'capital': None}\n",
|
||||
"{'index': 159, 'country_name': 'The Gambia', 'capital': None}\n",
|
||||
"{'index': 160, 'country_name': 'Gabon', 'capital': None}\n",
|
||||
"{'index': 161, 'country_name': 'Finland', 'capital': None}\n",
|
||||
"{'index': 162, 'country_name': 'Fiji', 'capital': None}\n",
|
||||
"{'index': 163, 'country_name': 'Ethiopia', 'capital': None}\n",
|
||||
"{'index': 164, 'country_name': 'Eswatini', 'capital': None}\n",
|
||||
"{'index': 165, 'country_name': 'Estonia', 'capital': None}\n",
|
||||
"{'index': 166, 'country_name': 'France', 'capital': None}\n",
|
||||
"{'index': 167, 'country_name': 'Eritrea', 'capital': None}\n",
|
||||
"{'index': 168, 'country_name': 'Equatorial Guinea', 'capital': None}\n",
|
||||
"{'index': 169, 'country_name': 'El Salvador', 'capital': None}\n",
|
||||
"{'index': 170, 'country_name': 'Egypt', 'capital': None}\n",
|
||||
"{'index': 171, 'country_name': 'Ecuador', 'capital': None}\n",
|
||||
"{'index': 172, 'country_name': 'East Timor', 'capital': None}\n",
|
||||
"{'index': 173, 'country_name': 'Dominica', 'capital': None}\n",
|
||||
"{'index': 174, 'country_name': 'Dominican Republic', 'capital': None}\n",
|
||||
"{'index': 175, 'country_name': 'Djibouti', 'capital': None}\n",
|
||||
"{'index': 176, 'country_name': 'Danish Realm', 'capital': None}\n",
|
||||
"{'index': 177, 'country_name': 'Czech Republic', 'capital': None}\n",
|
||||
"{'index': 178, 'country_name': 'Cyprus', 'capital': None}\n",
|
||||
"{'index': 179, 'country_name': 'Cuba', 'capital': None}\n",
|
||||
"{'index': 180, 'country_name': 'Burkina Faso', 'capital': None}\n",
|
||||
"{'index': 181, 'country_name': 'Bulgaria', 'capital': None}\n",
|
||||
"{'index': 182, 'country_name': 'Brunei', 'capital': None}\n",
|
||||
"{'index': 183, 'country_name': 'Brazil', 'capital': None}\n",
|
||||
"{'index': 184, 'country_name': 'Botswana', 'capital': None}\n",
|
||||
"{'index': 185, 'country_name': 'Bosnia and Herzegovina', 'capital': None}\n",
|
||||
"{'index': 186, 'country_name': 'Bolivia', 'capital': None}\n",
|
||||
"{'index': 187, 'country_name': 'Bhutan', 'capital': None}\n",
|
||||
"{'index': 188, 'country_name': 'Benin', 'capital': None}\n",
|
||||
"{'index': 189, 'country_name': 'Belize', 'capital': None}\n",
|
||||
"{'index': 190, 'country_name': 'Belgium', 'capital': None}\n",
|
||||
"{'index': 191, 'country_name': 'Belarus', 'capital': None}\n",
|
||||
"{'index': 192, 'country_name': 'Barbados', 'capital': None}\n",
|
||||
"{'index': 193, 'country_name': 'Bangladesh', 'capital': None}\n",
|
||||
"{'index': 194, 'country_name': 'Bahrain', 'capital': None}\n",
|
||||
"{'index': 195, 'country_name': 'The Bahamas', 'capital': None}\n",
|
||||
"{'index': 196, 'country_name': 'Azerbaijan', 'capital': None}\n",
|
||||
"{'index': 197, 'country_name': 'Austria', 'capital': None}\n",
|
||||
"{'index': 198, 'country_name': 'Australia', 'capital': None}\n",
|
||||
"{'index': 199, 'country_name': 'Armenia', 'capital': None}\n",
|
||||
"{'index': 200, 'country_name': 'Argentina', 'capital': None}\n",
|
||||
"{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': '\\n'}\n",
|
||||
"{'index': 202, 'country_name': 'Angola', 'capital': None}\n",
|
||||
"{'index': 203, 'country_name': 'Andorra', 'capital': None}\n",
|
||||
"{'index': 204, 'country_name': 'Algeria', 'capital': None}\n",
|
||||
"{'index': 205, 'country_name': 'Albania', 'capital': None}\n"
|
||||
"{'index': 23, 'country_name': 'Kosovo', 'capital': []}\n",
|
||||
"33 Vatican City\n",
|
||||
"64 Singapore\n",
|
||||
"{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': []}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -2012,8 +1991,14 @@
|
||||
"for index, country_name, country_html in zip(\n",
|
||||
" countries.index, countries[\"short_country_name\"], countries[\"country_html\"]\n",
|
||||
"):\n",
|
||||
" # print(json.dumps(extract_capital_0(index, country_name, country_html)))\n",
|
||||
" print(extract_capital_0(index, country_name, country_html))"
|
||||
" result = extract_capital_0(index, country_name, country_html)\n",
|
||||
" try:\n",
|
||||
" if len(result[\"capital\"]) == 0:\n",
|
||||
" print(result)\n",
|
||||
" except TypeError:\n",
|
||||
" print(index, country_name)\n",
|
||||
"\n",
|
||||
" # print(json.dumps(extract_capital_0(index, country_name, country_html)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2023,106 +2008,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4cc8e0f7-36ab-41b8-8ce9-d0ebe02a9896",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "914e0498-c75d-4c40-8320-ec62f5db8764",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T20:16:19.646094Z",
|
||||
"iopub.status.busy": "2022-06-25T20:16:19.645699Z",
|
||||
"iopub.status.idle": "2022-06-25T20:16:19.649925Z",
|
||||
"shell.execute_reply": "2022-06-25T20:16:19.648982Z",
|
||||
"shell.execute_reply.started": "2022-06-25T20:16:19.646064Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Playground"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 126,
|
||||
"id": "57d8b5a9-dd20-4214-866a-b57ecc7ad5da",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T22:09:14.820234Z",
|
||||
"iopub.status.busy": "2022-06-25T22:09:14.819573Z",
|
||||
"iopub.status.idle": "2022-06-25T22:09:14.830451Z",
|
||||
"shell.execute_reply": "2022-06-25T22:09:14.829525Z",
|
||||
"shell.execute_reply.started": "2022-06-25T22:09:14.820174Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"country_url https://en.wikipedia.org/wiki/Malaysia\n",
|
||||
"flag_description_url https://en.wikipedia.org/wiki/Flag_of_Malaysia\n",
|
||||
"short_country_name Malaysia\n",
|
||||
"country_html <div><tr><th colspan=\"2\" class=\"infobox-above ...\n",
|
||||
"flag_html <p>The <a href=\"/wiki/National_flag\" title=\"Na...\n",
|
||||
"file_urls [https:////upload.wikimedia.org/wikipedia/comm...\n",
|
||||
"files [{'url': 'https://upload.wikimedia.org/wikiped...\n",
|
||||
"Name: 116, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 126,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"countries.iloc[116]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a2f9b1bf-e60a-44d0-9a77-bdc44daeb838",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-06-25T22:17:26.496436Z",
|
||||
"iopub.status.busy": "2022-06-25T22:17:26.494903Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "expected string or bytes-like object",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Input \u001b[0;32mIn [131]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m116\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mextract_capital_0\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcountries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshort_country_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcountries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcountry_html\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"Input \u001b[0;32mIn [120]\u001b[0m, in \u001b[0;36mextract_capital_0\u001b[0;34m(index, country_name, country_html)\u001b[0m\n\u001b[1;32m 17\u001b[0m _capitals \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m capital_filter:\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43md\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m, re\u001b[38;5;241m.\u001b[39mMatch):\n\u001b[1;32m 20\u001b[0m _capitals\u001b[38;5;241m.\u001b[39mappend(item\u001b[38;5;241m.\u001b[39mtext)\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m match \u001b[38;5;129;01min\u001b[39;00m branch_filter:\n",
|
||||
"File \u001b[0;32m~/.pyenv/versions/3.8.12/lib/python3.8/re.py:201\u001b[0m, in \u001b[0;36msearch\u001b[0;34m(pattern, string, flags)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(pattern, string, flags\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m):\n\u001b[1;32m 199\u001b[0m \u001b[38;5;124;03m\"\"\"Scan through string looking for a match to the pattern, returning\u001b[39;00m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;124;03m a Match object, or None if no match was found.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpattern\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflags\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstring\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: expected string or bytes-like object"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"index = 116\n",
|
||||
"extract_capital_0(index, countries.iloc[index][\"short_country_name\"], countries.iloc[index][\"country_html\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e066fde4-b290-49d7-80d2-64e6003efcec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user