diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
index c578484..f06a812 100644
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
@@ -24,3 +24,10 @@ class AnthemsItem(scrapy.Item):
 
     file_urls = scrapy.Field()
     files = scrapy.Field()
+
+
+class CapitalsItem(scrapy.Item):
+    """Country name together with the capital names scraped for it."""
+
+    country_name = scrapy.Field()
+    capitals = scrapy.Field()
diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
new file mode 100644
index 0000000..15df8f3
--- /dev/null
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
@@ -0,0 +1,46 @@
+import pathlib
+
+import scrapy
+from scrapy.http import TextResponse
+from wikipedia_country_scraper.items import CapitalsItem
+
+
+class CapitalsSpider(scrapy.Spider):
+    """Scrape country names and their capitals from one Wikipedia list page."""
+
+    name = "capitals"
+    start_urls = [
+        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
+    ]
+    custom_settings = {
+        "FEEDS": {
+            # parents[4] climbs from spiders/ up to the directory that holds 01_scrapy/.
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "capitals.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
+
+    def parse(self, response: TextResponse):
+        """Yield one CapitalsItem per table row that names a country.
+
+        Country and capitals are read from the same row, so a row with no
+        titled link (or several) in its first cell cannot shift the
+        pairing of every row after it.
+        """
+        rows = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]")
+
+        for row in rows:
+            # The first titled link text in the first cell is used as the country name.
+            country = row.xpath("td[1]//a[@title]/text()").get()
+            if country is None:
+                # No country link in this row, so there is nothing to attach capitals to.
+                continue
+
+            capital_item = CapitalsItem()
+            capital_item["country_name"] = country
+            capital_item["capitals"] = row.xpath("td[2]/a[@title]/text()").getall()
+
+            yield capital_item
diff --git a/playground/downloaded_data_inspection_lab/capital_xpath.ipynb b/playground/downloaded_data_inspection_lab/capital_xpath.ipynb index 4bed7e5..ec411ef 100644 --- a/playground/downloaded_data_inspection_lab/capital_xpath.ipynb +++ b/playground/downloaded_data_inspection_lab/capital_xpath.ipynb @@ -1561,7 +1561,9 @@ } ], "source": [ - "countries[countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())]" + "countries[\n", + " countries[\"short_country_name\"].map(lambda country: \"arab\" in country.lower())\n", + "]" ] }, { @@ -1661,7 +1663,10 @@ " for match in branch_filter:\n", " print(etree.tostring(match))\n", " print(\"match\", match.text)\n", - " if isinstance(re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")), re.Match):\n", + " if isinstance(\n", + " re.search(r\"(?:i\\>)([^\\<)]*)\", etree.tostring(match).decode(\"utf-8\")),\n", + " re.Match,\n", + " ):\n", " _capitals.append(match.text)\n", " result = {\n", " \"index\": 20,\n", @@ -1702,6 +1707,169 @@ "result" ] }, + { + "cell_type": "markdown", + "id": "7dbe415a-ba82-4813-9723-ac66ec9b29aa", + "metadata": {}, + "source": [ + "#### State of Palestine" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "id": "b39a6451-19e0-4ec6-a925-89bcdb89c441", + "metadata": { + "execution": { + "iopub.execute_input": "2022-06-25T23:47:35.380713Z", + "iopub.status.busy": "2022-06-25T23:47:35.380097Z", + "iopub.status.idle": "2022-06-25T23:47:35.485628Z", + "shell.execute_reply": "2022-06-25T23:47:35.484621Z", + "shell.execute_reply.started": "2022-06-25T23:47:35.380654Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | country_url | \n", + "flag_description_url | \n", + "short_country_name | \n", + "country_html | \n", + "flag_html | \n", + "file_urls | \n", + "files | \n", + "
|---|---|---|---|---|---|---|---|
| 87 | \n", + "https://en.wikipedia.org/wiki/State_of_Palestine | \n", + "https://en.wikipedia.org/wiki/Flag_of_Palestine | \n", + "State of Palestine | \n", + "<div><tr><th colspan=\"2\" class=\"infobox-above ... | \n", + "<p>The <b>flag of Palestine</b> (<a href=\"/wik... | \n", + "[https:////upload.wikimedia.org/wikipedia/comm... | \n", + "[{'url': 'https://upload.wikimedia.org/wikiped... | \n", + "