diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py index c578484..f06a812 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py @@ -4,6 +4,7 @@ # https://docs.scrapy.org/en/latest/topics/items.html import scrapy +from wikipedia_country_scraper.items import WikipediaCountryScraperItem class WikipediaCountryScraperItem(scrapy.Item): @@ -24,3 +25,8 @@ class AnthemsItem(scrapy.Item): file_urls = scrapy.Field() files = scrapy.Field() + + +class CapitalsItem(scrapy.Item): + country_name = scrapy.Field() + capitals = scrapy.Field() diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py new file mode 100644 index 0000000..15df8f3 --- /dev/null +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py @@ -0,0 +1,37 @@ +import scrapy +from scrapy.http import TextResponse +from wikipedia_country_scraper.items import CapitalsItem + + +class CapitalsSpider(scrapy.Spider): + name = "capitals" + start_urls = [ + "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages" + ] + custom_settings = { + "FEEDS": { + pathlib.Path(__file__).resolve().parents[4] + / "data" + / "scrapy" + / "raw_country_data" + / "capitals.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2} + }, + } + + def parse(self, response: TextResponse): + _country = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]") + + country_names = [country_name.get() for country_name in _country.xpath("td[1]//a[@title]/text()")] + + capital_names = [] + for capital in _country.xpath("td[2]"): + _capitals = [subcapital.get() for subcapital in capital.xpath("a[@title]/text()")] + capital_names.append(_capitals) + + for country, capitals in zip(country_names, capital_names): + capital_item = CapitalsItem() + + capital_item["country_name"] = country + capital_item["capitals"] = capitals + + yield capital_item diff --git a/playground/downloaded_data_inspection_lab/capital_xpath.ipynb b/playground/downloaded_data_inspection_lab/capital_xpath.ipynb index 4bed7e5..ec411ef 100644 --- a/playground/downloaded_data_inspection_lab/capital_xpath.ipynb +++ b/playground/downloaded_data_inspection_lab/capital_xpath.ipynb @@ -1561,7 +1561,9 @@ } ], "source": [ - "countries[countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())]" + "countries[\n", + " countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())\n", + "]" ] }, { @@ -1661,7 +1663,10 @@ " for match in branch_filter:\n", " print(etree.tostring(match))\n", " print(\"match\", match.text)\n", - " if isinstance(re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n", + " if isinstance(\n", + " re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")),\n", + " re.Match,\n", + " ):\n", " _capitals.append(match.text)\n", " result = {\n", " \"index\": 20,\n", @@ -1702,6 +1707,169 @@ "result" ] }, + { + "cell_type": "markdown", + "id": "7dbe415a-ba82-4813-9723-ac66ec9b29aa", + "metadata": {}, + "source": [ + "#### State of Palestine" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "id": "b39a6451-19e0-4ec6-a925-89bcdb89c441", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T23:47:35.380713Z", + "iopub.status.busy": "2022-06-25T23:47:35.380097Z", + "iopub.status.idle": "2022-06-25T23:47:35.485628Z", + "shell.execute_reply": "2022-06-25T23:47:35.484621Z", + "shell.execute_reply.started": "2022-06-25T23:47:35.380654Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_description_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
87https://en.wikipedia.org/wiki/State_of_Palestinehttps://en.wikipedia.org/wiki/Flag_of_PalestineState of Palestine<div><tr><th colspan=\"2\" class=\"infobox-above ...<p>The <b>flag of Palestine</b> (<a href=\"/wik...[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "
" + ], + "text/plain": [ + " country_url \\\n", + "87 https://en.wikipedia.org/wiki/State_of_Palestine \n", + "\n", + " flag_description_url short_country_name \\\n", + "87 https://en.wikipedia.org/wiki/Flag_of_Palestine State of Palestine \n", + "\n", + " country_html \\\n", + "87
The flag of Palestine ()([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n", + " if isinstance(\n", + " re.search(\n", + " r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")\n", + " ),\n", + " re.Match,\n", + " ):\n", " _capitals.append(match.text)\n", " result = {\n", - " \"index\": 20,\n", - " \"country_name\": countries[\"short_country_name\"].iloc[20],\n", + " \"index\": index,\n", + " \"country_name\": country_name,\n", " \"capital\": _capitals,\n", " }\n", "\n", + " # proclaimed capitals: e.g Palestine\n", + " if result is None:\n", + " for element in root.xpath(\n", + " \"//th/div/ul/li[text() = 'Proclaimed capital']/following::td[1]//li[1]/a[1]\"\n", + " ):\n", + " result = {\n", + " \"index\": index,\n", + " \"country_name\": country_name,\n", + " \"capital\": element.text,\n", + " }\n", + "\n", " return result or None" ] }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 273, "id": "0b39d1c7-a1ee-4031-a0eb-93ab5ef525ff", "metadata": { "execution": { - "iopub.execute_input": "2022-06-25T22:10:22.757906Z", - "iopub.status.busy": "2022-06-25T22:10:22.757413Z", - "iopub.status.idle": "2022-06-25T22:10:22.881155Z", - "shell.execute_reply": "2022-06-25T22:10:22.880194Z", - "shell.execute_reply.started": "2022-06-25T22:10:22.757877Z" + "iopub.execute_input": "2022-06-26T00:06:10.855929Z", + "iopub.status.busy": "2022-06-26T00:06:10.855004Z", + "iopub.status.idle": "2022-06-26T00:06:11.012760Z", + "shell.execute_reply": "2022-06-26T00:06:11.011838Z", + "shell.execute_reply.started": "2022-06-26T00:06:10.855859Z" }, "tags": [] }, @@ -1799,212 +1980,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'index': 0, 'country_name': 'Afghanistan', 'capital': None}\n", - "{'index': 1, 'country_name': 'Croatia', 'capital': None}\n", - "{'index': 2, 'country_name': 'Costa Rica', 'capital': None}\n", - "{'index': 3, 'country_name': 'Democratic Republic of the Congo', 'capital': None}\n", - "{'index': 4, 'country_name': 'Comoros', 'capital': None}\n", - "{'index': 5, 'country_name': 'Republic of the Congo', 'capital': None}\n", - "{'index': 6, 'country_name': 'China', 'capital': None}\n", - "{'index': 7, 'country_name': 'Chile', 'capital': None}\n", - "{'index': 8, 'country_name': 'Chad', 'capital': None}\n", - "{'index': 9, 'country_name': 'Central African Republic', 'capital': None}\n", - "{'index': 10, 'country_name': 'Cape Verde', 'capital': None}\n", - "{'index': 11, 'country_name': 'Colombia', 'capital': None}\n", - "{'index': 12, 'country_name': 'Cameroon', 'capital': None}\n", - "{'index': 13, 'country_name': 'Cambodia', 'capital': None}\n", - "{'index': 14, 'country_name': 'Burundi', 'capital': None}\n", - "{'index': 15, 'country_name': 'Transnistria', 'capital': None}\n", - "{'index': 16, 'country_name': 'Canada', 'capital': None}\n", - "{'index': 17, 'country_name': 'Taiwan', 'capital': None}\n", - "{'index': 18, 'country_name': 'South Ossetia', 'capital': None}\n", - "{'index': 19, 'country_name': 'Somaliland', 'capital': None}\n", - "{'index': 20, 'country_name': 'Sahrawi Arab Democratic Republic', 'capital': None}\n", - "{'index': 21, 'country_name': 'Northern Cyprus', 'capital': None}\n", - "{'index': 22, 'country_name': 'Niue', 'capital': None}\n", - "{'index': 23, 'country_name': 'Kosovo', 'capital': None}\n", - "{'index': 24, 'country_name': 'Cook Islands', 'capital': None}\n", - "{'index': 25, 'country_name': 'Republic of Artsakh', 'capital': None}\n", - "{'index': 26, 'country_name': 'Abkhazia', 'capital': None}\n", - "{'index': 27, 'country_name': 'Zimbabwe', 'capital': None}\n", - "{'index': 28, 'country_name': 'Zambia', 'capital': None}\n", - "{'index': 29, 'country_name': 'Yemen', 'capital': None}\n", - "{'index': 30, 'country_name': 'Vietnam', 'capital': None}\n", - "{'index': 31, 'country_name': 'Venezuela', 'capital': None}\n", - "{'index': 32, 'country_name': 'Luhansk People%27s Republic', 'capital': None}\n", - "None\n", - "{'index': 34, 'country_name': 'Vanuatu', 'capital': None}\n", - "{'index': 35, 'country_name': 'Uzbekistan', 'capital': None}\n", - "{'index': 36, 'country_name': 'Uruguay', 'capital': None}\n", - "{'index': 37, 'country_name': 'United Kingdom', 'capital': None}\n", - "{'index': 38, 'country_name': 'United States', 'capital': None}\n", - "{'index': 39, 'country_name': 'United Arab Emirates', 'capital': None}\n", - "{'index': 40, 'country_name': 'Uganda', 'capital': None}\n", - "{'index': 41, 'country_name': 'Tuvalu', 'capital': None}\n", - "{'index': 42, 'country_name': 'Tunisia', 'capital': None}\n", - "{'index': 43, 'country_name': 'Turkmenistan', 'capital': None}\n", - "{'index': 44, 'country_name': 'Tonga', 'capital': None}\n", - "{'index': 45, 'country_name': 'Trinidad and Tobago', 'capital': None}\n", - "{'index': 46, 'country_name': 'Togo', 'capital': None}\n", - "{'index': 47, 'country_name': 'Ukraine', 'capital': None}\n", - "{'index': 48, 'country_name': 'Thailand', 'capital': None}\n", - "{'index': 49, 'country_name': 'Tanzania', 'capital': None}\n", - "{'index': 50, 'country_name': 'Tajikistan', 'capital': None}\n", - "{'index': 51, 'country_name': 'Syria', 'capital': None}\n", - "{'index': 52, 'country_name': 'Switzerland', 'capital': None}\n", - "{'index': 53, 'country_name': 'Sweden', 'capital': None}\n", - "{'index': 54, 'country_name': 'Sudan', 'capital': None}\n", - "{'index': 55, 'country_name': 'Suriname', 'capital': None}\n", - "{'index': 56, 'country_name': 'Sri Lanka', 'capital': None}\n", - "{'index': 57, 'country_name': 'Spain', 'capital': None}\n", - "{'index': 58, 'country_name': 'South Africa', 'capital': None}\n", - "{'index': 59, 'country_name': 'Somalia', 'capital': None}\n", - "{'index': 60, 'country_name': 'Solomon Islands', 'capital': None}\n", - "{'index': 61, 'country_name': 'Slovenia', 'capital': None}\n", - "{'index': 62, 'country_name': 'South Sudan', 'capital': None}\n", - "{'index': 63, 'country_name': 'Slovakia', 'capital': None}\n", - "None\n", - "{'index': 65, 'country_name': 'Sierra Leone', 'capital': None}\n", - "{'index': 66, 'country_name': 'Seychelles', 'capital': None}\n", - "{'index': 67, 'country_name': 'Serbia', 'capital': None}\n", - "{'index': 68, 'country_name': 'Senegal', 'capital': None}\n", - "{'index': 69, 'country_name': 'Saudi Arabia', 'capital': None}\n", - "{'index': 70, 'country_name': 'S%C3%A3o Tom%C3%A9 and Pr%C3%ADncipe', 'capital': None}\n", - "{'index': 71, 'country_name': 'San Marino', 'capital': None}\n", - "{'index': 72, 'country_name': 'Samoa', 'capital': None}\n", - "{'index': 73, 'country_name': 'Saint Vincent and the Grenadines', 'capital': None}\n", - "{'index': 74, 'country_name': 'Saint Lucia', 'capital': None}\n", - "{'index': 75, 'country_name': 'Rwanda', 'capital': None}\n", - "{'index': 76, 'country_name': 'Romania', 'capital': None}\n", - "{'index': 77, 'country_name': 'Portugal', 'capital': None}\n", - "{'index': 78, 'country_name': 'Poland', 'capital': None}\n", - "{'index': 79, 'country_name': 'Saint Kitts and Nevis', 'capital': None}\n", - "{'index': 80, 'country_name': 'Philippines', 'capital': None}\n", - "{'index': 81, 'country_name': 'Qatar', 'capital': None}\n", - "{'index': 82, 'country_name': 'Russia', 'capital': None}\n", - "{'index': 83, 'country_name': 'Peru', 'capital': None}\n", - "{'index': 84, 'country_name': 'Papua New Guinea', 'capital': None}\n", - "{'index': 85, 'country_name': 'Paraguay', 'capital': None}\n", - "{'index': 86, 'country_name': 'Panama', 'capital': None}\n", - "None\n", - "{'index': 88, 'country_name': 'Palau', 'capital': None}\n", - "{'index': 89, 'country_name': 'Oman', 'capital': None}\n", - "{'index': 90, 'country_name': 'North Macedonia', 'capital': None}\n", - "{'index': 91, 'country_name': 'Norway', 'capital': None}\n", - "{'index': 92, 'country_name': 'Pakistan', 'capital': None}\n", - "{'index': 93, 'country_name': 'Nigeria', 'capital': None}\n", - "{'index': 94, 'country_name': 'Niger', 'capital': None}\n", - "{'index': 95, 'country_name': 'Nicaragua', 'capital': None}\n", - "{'index': 96, 'country_name': 'Kingdom of the Netherlands', 'capital': None}\n", - "{'index': 97, 'country_name': 'Nepal', 'capital': None}\n", - "{'index': 98, 'country_name': 'Nauru', 'capital': None}\n", - "{'index': 99, 'country_name': 'New Zealand', 'capital': None}\n", - "{'index': 100, 'country_name': 'Namibia', 'capital': None}\n", - "{'index': 101, 'country_name': 'Myanmar', 'capital': None}\n", - "{'index': 102, 'country_name': 'Mozambique', 'capital': None}\n", - "{'index': 103, 'country_name': 'Morocco', 'capital': None}\n", - "{'index': 104, 'country_name': 'Montenegro', 'capital': None}\n", - "{'index': 105, 'country_name': 'Monaco', 'capital': None}\n", - "{'index': 106, 'country_name': 'Moldova', 'capital': None}\n", - "{'index': 107, 'country_name': 'Federated States of Micronesia', 'capital': None}\n", - "{'index': 108, 'country_name': 'Mexico', 'capital': None}\n", - "{'index': 109, 'country_name': 'Mongolia', 'capital': None}\n", - "{'index': 110, 'country_name': 'Mauritius', 'capital': None}\n", - "{'index': 111, 'country_name': 'Mauritania', 'capital': None}\n", - "{'index': 112, 'country_name': 'Marshall Islands', 'capital': None}\n", - "{'index': 113, 'country_name': 'Malta', 'capital': None}\n", - "{'index': 114, 'country_name': 'Mali', 'capital': None}\n", - "{'index': 115, 'country_name': 'Maldives', 'capital': None}\n", - "{'index': 116, 'country_name': 'Malaysia', 'capital': None}\n", - "{'index': 117, 'country_name': 'Malawi', 'capital': None}\n", - "{'index': 118, 'country_name': 'Madagascar', 'capital': None}\n", - "{'index': 119, 'country_name': 'Luxembourg', 'capital': None}\n", - "{'index': 120, 'country_name': 'Lithuania', 'capital': None}\n", - "{'index': 121, 'country_name': 'Liechtenstein', 'capital': None}\n", - "{'index': 122, 'country_name': 'Libya', 'capital': None}\n", - "{'index': 123, 'country_name': 'Liberia', 'capital': None}\n", - "{'index': 124, 'country_name': 'Lesotho', 'capital': None}\n", - "{'index': 125, 'country_name': 'Lebanon', 'capital': None}\n", - "{'index': 126, 'country_name': 'Latvia', 'capital': None}\n", - "{'index': 127, 'country_name': 'Laos', 'capital': None}\n", - "{'index': 128, 'country_name': 'Kyrgyzstan', 'capital': None}\n", - "{'index': 129, 'country_name': 'Kuwait', 'capital': None}\n", - "{'index': 130, 'country_name': 'South Korea', 'capital': None}\n", - "{'index': 131, 'country_name': 'North Korea', 'capital': None}\n", - "{'index': 132, 'country_name': 'Kiribati', 'capital': None}\n", - "{'index': 133, 'country_name': 'Kenya', 'capital': None}\n", - "{'index': 134, 'country_name': 'Kazakhstan', 'capital': None}\n", - "{'index': 135, 'country_name': 'Jordan', 'capital': None}\n", - "{'index': 136, 'country_name': 'Japan', 'capital': None}\n", - "{'index': 137, 'country_name': 'Jamaica', 'capital': None}\n", - "{'index': 138, 'country_name': 'Ivory Coast', 'capital': None}\n", - "{'index': 139, 'country_name': 'Italy', 'capital': None}\n", - "{'index': 140, 'country_name': 'Israel', 'capital': None}\n", - "{'index': 141, 'country_name': 'Republic of Ireland', 'capital': None}\n", - "{'index': 142, 'country_name': 'Iraq', 'capital': None}\n", - "{'index': 143, 'country_name': 'Iran', 'capital': None}\n", - "{'index': 144, 'country_name': 'Indonesia', 'capital': None}\n", - "{'index': 145, 'country_name': 'India', 'capital': None}\n", - "{'index': 146, 'country_name': 'Iceland', 'capital': None}\n", - "{'index': 147, 'country_name': 'Hungary', 'capital': None}\n", - "{'index': 148, 'country_name': 'Honduras', 'capital': None}\n", - "{'index': 149, 'country_name': 'Guyana', 'capital': None}\n", - "{'index': 150, 'country_name': 'Haiti', 'capital': None}\n", - "{'index': 151, 'country_name': 'Guinea-Bissau', 'capital': None}\n", - "{'index': 152, 'country_name': 'Guinea', 'capital': None}\n", - "{'index': 153, 'country_name': 'Guatemala', 'capital': None}\n", - "{'index': 154, 'country_name': 'Grenada', 'capital': None}\n", - "{'index': 155, 'country_name': 'Greece', 'capital': None}\n", - "{'index': 156, 'country_name': 'Ghana', 'capital': None}\n", - "{'index': 157, 'country_name': 'Germany', 'capital': None}\n", - "{'index': 158, 'country_name': 'Georgia (country)', 'capital': None}\n", - "{'index': 159, 'country_name': 'The Gambia', 'capital': None}\n", - "{'index': 160, 'country_name': 'Gabon', 'capital': None}\n", - "{'index': 161, 'country_name': 'Finland', 'capital': None}\n", - "{'index': 162, 'country_name': 'Fiji', 'capital': None}\n", - "{'index': 163, 'country_name': 'Ethiopia', 'capital': None}\n", - "{'index': 164, 'country_name': 'Eswatini', 'capital': None}\n", - "{'index': 165, 'country_name': 'Estonia', 'capital': None}\n", - "{'index': 166, 'country_name': 'France', 'capital': None}\n", - "{'index': 167, 'country_name': 'Eritrea', 'capital': None}\n", - "{'index': 168, 'country_name': 'Equatorial Guinea', 'capital': None}\n", - "{'index': 169, 'country_name': 'El Salvador', 'capital': None}\n", - "{'index': 170, 'country_name': 'Egypt', 'capital': None}\n", - "{'index': 171, 'country_name': 'Ecuador', 'capital': None}\n", - "{'index': 172, 'country_name': 'East Timor', 'capital': None}\n", - "{'index': 173, 'country_name': 'Dominica', 'capital': None}\n", - "{'index': 174, 'country_name': 'Dominican Republic', 'capital': None}\n", - "{'index': 175, 'country_name': 'Djibouti', 'capital': None}\n", - "{'index': 176, 'country_name': 'Danish Realm', 'capital': None}\n", - "{'index': 177, 'country_name': 'Czech Republic', 'capital': None}\n", - "{'index': 178, 'country_name': 'Cyprus', 'capital': None}\n", - "{'index': 179, 'country_name': 'Cuba', 'capital': None}\n", - "{'index': 180, 'country_name': 'Burkina Faso', 'capital': None}\n", - "{'index': 181, 'country_name': 'Bulgaria', 'capital': None}\n", - "{'index': 182, 'country_name': 'Brunei', 'capital': None}\n", - "{'index': 183, 'country_name': 'Brazil', 'capital': None}\n", - "{'index': 184, 'country_name': 'Botswana', 'capital': None}\n", - "{'index': 185, 'country_name': 'Bosnia and Herzegovina', 'capital': None}\n", - "{'index': 186, 'country_name': 'Bolivia', 'capital': None}\n", - "{'index': 187, 'country_name': 'Bhutan', 'capital': None}\n", - "{'index': 188, 'country_name': 'Benin', 'capital': None}\n", - "{'index': 189, 'country_name': 'Belize', 'capital': None}\n", - "{'index': 190, 'country_name': 'Belgium', 'capital': None}\n", - "{'index': 191, 'country_name': 'Belarus', 'capital': None}\n", - "{'index': 192, 'country_name': 'Barbados', 'capital': None}\n", - "{'index': 193, 'country_name': 'Bangladesh', 'capital': None}\n", - "{'index': 194, 'country_name': 'Bahrain', 'capital': None}\n", - "{'index': 195, 'country_name': 'The Bahamas', 'capital': None}\n", - "{'index': 196, 'country_name': 'Azerbaijan', 'capital': None}\n", - "{'index': 197, 'country_name': 'Austria', 'capital': None}\n", - "{'index': 198, 'country_name': 'Australia', 'capital': None}\n", - "{'index': 199, 'country_name': 'Armenia', 'capital': None}\n", - "{'index': 200, 'country_name': 'Argentina', 'capital': None}\n", - "{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': '\\n'}\n", - "{'index': 202, 'country_name': 'Angola', 'capital': None}\n", - "{'index': 203, 'country_name': 'Andorra', 'capital': None}\n", - "{'index': 204, 'country_name': 'Algeria', 'capital': None}\n", - "{'index': 205, 'country_name': 'Albania', 'capital': None}\n" + "{'index': 23, 'country_name': 'Kosovo', 'capital': []}\n", + "33 Vatican City\n", + "64 Singapore\n", + "{'index': 201, 'country_name': 'Antigua and Barbuda', 'capital': []}\n" ] } ], @@ -2012,8 +1991,14 @@ "for index, country_name, country_html in zip(\n", " countries.index, countries[\"short_country_name\"], countries[\"country_html\"]\n", "):\n", - " # print(json.dumps(extract_capital_0(index, country_name, country_html)))\n", - " print(extract_capital_0(index, country_name, country_html))" + " result = extract_capital_0(index, country_name, country_html)\n", + " try:\n", + " if len(result[\"capital\"]) == 0:\n", + " print(result)\n", + " except TypeError:\n", + " print(index, country_name)\n", + "\n", + " # print(json.dumps(extract_capital_0(index, country_name, country_html)))" ] }, { @@ -2023,106 +2008,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4cc8e0f7-36ab-41b8-8ce9-d0ebe02a9896", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "914e0498-c75d-4c40-8320-ec62f5db8764", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-25T20:16:19.646094Z", - "iopub.status.busy": "2022-06-25T20:16:19.645699Z", - "iopub.status.idle": "2022-06-25T20:16:19.649925Z", - "shell.execute_reply": "2022-06-25T20:16:19.648982Z", - "shell.execute_reply.started": "2022-06-25T20:16:19.646064Z" - } - }, - "source": [ - "## Playground" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "id": "57d8b5a9-dd20-4214-866a-b57ecc7ad5da", - "metadata": { - "execution": { - "iopub.execute_input": "2022-06-25T22:09:14.820234Z", - "iopub.status.busy": "2022-06-25T22:09:14.819573Z", - "iopub.status.idle": "2022-06-25T22:09:14.830451Z", - "shell.execute_reply": "2022-06-25T22:09:14.829525Z", - "shell.execute_reply.started": "2022-06-25T22:09:14.820174Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "country_url https://en.wikipedia.org/wiki/Malaysia\n", - "flag_description_url https://en.wikipedia.org/wiki/Flag_of_Malaysia\n", - "short_country_name Malaysia\n", - "country_html
The \u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m116\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mextract_capital_0\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcountries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshort_country_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcountries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcountry_html\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", - "Input \u001b[0;32mIn [120]\u001b[0m, in \u001b[0;36mextract_capital_0\u001b[0;34m(index, country_name, country_html)\u001b[0m\n\u001b[1;32m 17\u001b[0m _capitals \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m capital_filter:\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43md\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m, re\u001b[38;5;241m.\u001b[39mMatch):\n\u001b[1;32m 20\u001b[0m _capitals\u001b[38;5;241m.\u001b[39mappend(item\u001b[38;5;241m.\u001b[39mtext)\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m match \u001b[38;5;129;01min\u001b[39;00m branch_filter:\n", - "File \u001b[0;32m~/.pyenv/versions/3.8.12/lib/python3.8/re.py:201\u001b[0m, in \u001b[0;36msearch\u001b[0;34m(pattern, string, flags)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msearch\u001b[39m(pattern, string, flags\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m):\n\u001b[1;32m 199\u001b[0m \u001b[38;5;124;03m\"\"\"Scan through string looking for a match to the pattern, returning\u001b[39;00m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;124;03m a Match object, or None if no match was found.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpattern\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflags\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstring\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: expected string or bytes-like object" - ] - } - ], - "source": [ - "index = 116\n", - "extract_capital_0(index, countries.iloc[index][\"short_country_name\"], countries.iloc[index][\"country_html\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e066fde4-b290-49d7-80d2-64e6003efcec", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {