diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py index 45d714a..8560523 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py @@ -8,7 +8,7 @@ import scrapy class WikipediaCountryScraperItem(scrapy.Item): country_url = scrapy.Field() - flag_image_url = scrapy.Field() + flag_description_url = scrapy.Field() short_country_name = scrapy.Field() country_html = scrapy.Field() flag_html = scrapy.Field() diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py index 8d130f7..e42b339 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py @@ -50,14 +50,9 @@ class CountrydownloaderSpider(scrapy.Spider): "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" ).get() - try: - flag_description_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" - ).getall()[-1] - except IndexError: - flag_description_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" - ).get() + flag_description_url = response.xpath( + "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a[not(contains(@href, 'cite_note'))]/@href" + ).getall()[-1] country_item = { **country_item, @@ -71,6 +66,7 @@ class CountrydownloaderSpider(scrapy.Spider): "country_item": country_item, "urls": { "flag_image_url": f"https://en.wikipedia.org{flag_image_url}", + "flag_description_url": f"https://en.wikipedia.org{flag_description_url}", }, }, ) @@ -103,7 +99,7 @@ class CountrydownloaderSpider(scrapy.Spider): # yield the country item containing scraped data country_scrapy_item = WikipediaCountryScraperItem() country_scrapy_item["country_url"] = country_item["country_url"] - country_scrapy_item["flag_image_url"] = urls["flag_image_url"] + country_scrapy_item["flag_description_url"] = urls["flag_description_url"] country_scrapy_item["short_country_name"] = country_item["short_country_name"] country_scrapy_item["country_html"] = country_item["country_html"] country_scrapy_item["flag_html"] = country_item["flag_html"] diff --git a/playground/downloaded_data_inspection_lab/Untitled.ipynb b/playground/downloaded_data_inspection_lab/Untitled.ipynb index f890fec..5f91a00 100644 --- a/playground/downloaded_data_inspection_lab/Untitled.ipynb +++ b/playground/downloaded_data_inspection_lab/Untitled.ipynb @@ -79,15 +79,15 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 74, "id": "d03be94e-8642-4916-8a43-1711e0c21b36", "metadata": { "execution": { - "iopub.execute_input": "2022-06-24T21:32:00.473759Z", - "iopub.status.busy": "2022-06-24T21:32:00.473129Z", - "iopub.status.idle": "2022-06-24T21:32:00.812851Z", - "shell.execute_reply": "2022-06-24T21:32:00.812131Z", - "shell.execute_reply.started": "2022-06-24T21:32:00.473730Z" + "iopub.execute_input": "2022-06-24T21:52:35.726961Z", + "iopub.status.busy": "2022-06-24T21:52:35.726356Z", + "iopub.status.idle": "2022-06-24T21:52:35.823456Z", + "shell.execute_reply": "2022-06-24T21:52:35.822464Z", + "shell.execute_reply.started": "2022-06-24T21:52:35.726932Z" }, "tags": [] }, @@ -100,15 +100,15 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 75, "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "metadata": { "execution": { - "iopub.execute_input": "2022-06-24T21:32:01.728669Z", - "iopub.status.busy": "2022-06-24T21:32:01.728021Z", - "iopub.status.idle": "2022-06-24T21:32:01.919644Z", - "shell.execute_reply": "2022-06-24T21:32:01.917919Z", - "shell.execute_reply.started": "2022-06-24T21:32:01.728629Z" + "iopub.execute_input": "2022-06-24T21:52:36.453655Z", + "iopub.status.busy": "2022-06-24T21:52:36.452202Z", + "iopub.status.idle": "2022-06-24T21:52:36.555461Z", + "shell.execute_reply": "2022-06-24T21:52:36.554096Z", + "shell.execute_reply.started": "2022-06-24T21:52:36.453559Z" }, "tags": [] }, @@ -119,15 +119,15 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 76, "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "metadata": { "execution": { - "iopub.execute_input": "2022-06-24T21:32:04.251701Z", - "iopub.status.busy": "2022-06-24T21:32:04.250831Z", - "iopub.status.idle": "2022-06-24T21:32:04.263351Z", - "shell.execute_reply": "2022-06-24T21:32:04.262556Z", - "shell.execute_reply.started": "2022-06-24T21:32:04.251670Z" + "iopub.execute_input": "2022-06-24T21:52:36.950435Z", + "iopub.status.busy": "2022-06-24T21:52:36.949946Z", + "iopub.status.idle": "2022-06-24T21:52:36.959935Z", + "shell.execute_reply": "2022-06-24T21:52:36.958581Z", + "shell.execute_reply.started": "2022-06-24T21:52:36.950398Z" }, "tags": [] }, @@ -151,15 +151,15 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 77, "id": "48db8f93-659b-45a4-8477-a7cec139bebc", "metadata": { "execution": { - "iopub.execute_input": "2022-06-24T21:32:05.191313Z", - "iopub.status.busy": "2022-06-24T21:32:05.190427Z", - "iopub.status.idle": "2022-06-24T21:32:05.233768Z", - "shell.execute_reply": "2022-06-24T21:32:05.232310Z", - "shell.execute_reply.started": "2022-06-24T21:32:05.191232Z" + "iopub.execute_input": "2022-06-24T21:52:37.635010Z", + "iopub.status.busy": "2022-06-24T21:52:37.634417Z", + "iopub.status.idle": "2022-06-24T21:52:37.645162Z", + "shell.execute_reply": "2022-06-24T21:52:37.643796Z", + "shell.execute_reply.started": "2022-06-24T21:52:37.634953Z" }, "tags": [] }, @@ -183,15 +183,15 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 81, "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "metadata": { "execution": { - "iopub.execute_input": "2022-06-24T21:36:11.684046Z", - "iopub.status.busy": "2022-06-24T21:36:11.683658Z", - "iopub.status.idle": "2022-06-24T21:36:11.690174Z", - "shell.execute_reply": "2022-06-24T21:36:11.689279Z", - "shell.execute_reply.started": "2022-06-24T21:36:11.684015Z" + "iopub.execute_input": "2022-06-24T21:52:49.070882Z", + "iopub.status.busy": "2022-06-24T21:52:49.070107Z", + "iopub.status.idle": "2022-06-24T21:52:49.076033Z", + "shell.execute_reply": "2022-06-24T21:52:49.075243Z", + "shell.execute_reply.started": "2022-06-24T21:52:49.070853Z" }, "tags": [] }, @@ -215,15 +215,15 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 84, "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "metadata": { "execution": { - "iopub.execute_input": "2022-06-24T21:36:13.608192Z", - "iopub.status.busy": "2022-06-24T21:36:13.607778Z", - "iopub.status.idle": "2022-06-24T21:36:13.623089Z", - "shell.execute_reply": "2022-06-24T21:36:13.622311Z", - "shell.execute_reply.started": "2022-06-24T21:36:13.608162Z" + "iopub.execute_input": "2022-06-24T21:53:15.797108Z", + "iopub.status.busy": "2022-06-24T21:53:15.796761Z", + "iopub.status.idle": "2022-06-24T21:53:15.809904Z", + "shell.execute_reply": "2022-06-24T21:53:15.809157Z", + "shell.execute_reply.started": "2022-06-24T21:53:15.797079Z" }, "tags": [] }, @@ -250,6 +250,7 @@ " \n", " \n", " country_url\n", + " flag_image_url\n", " short_country_name\n", " country_html\n", " flag_html\n", @@ -261,6 +262,7 @@ " \n", " 84\n", " https://en.wikipedia.org/wiki/Paraguay\n", + " https://en.wikipedia.org/wiki/File:Flag_of_Par...\n", " Paraguay\n", " [<tr><th colspan=\"2\" class=\"infobox-above adr\"...\n", " None\n", @@ -272,8 +274,11 @@ "" ], "text/plain": [ - " country_url short_country_name \\\n", - "84 https://en.wikipedia.org/wiki/Paraguay Paraguay \n", + " country_url \\\n", + "84 https://en.wikipedia.org/wiki/Paraguay \n", + "\n", + " flag_image_url short_country_name \\\n", + "84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n", "\n", " country_html flag_html \\\n", "84 [\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlflag_image_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
84https://en.wikipedia.org/wiki/Paraguayhttps://en.wikipedia.org/wiki/File:Flag_of_Par...Paraguay[<tr><th colspan=\"2\" class=\"infobox-above adr\"...None[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "" + ], + "text/plain": [ + " country_url \\\n", + "84 https://en.wikipedia.org/wiki/Paraguay \n", + "\n", + " flag_image_url short_country_name \\\n", + "84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n", + "\n", + " country_html flag_html \\\n", + "84 [