diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py index 5324d89..45d714a 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py @@ -8,6 +8,7 @@ import scrapy class WikipediaCountryScraperItem(scrapy.Item): country_url = scrapy.Field() + flag_image_url = scrapy.Field() short_country_name = scrapy.Field() country_html = scrapy.Field() flag_html = scrapy.Field() diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py index ccd67f9..8d130f7 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py @@ -49,9 +49,15 @@ class CountrydownloaderSpider(scrapy.Spider): flag_image_url = response.xpath( "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" ).get() - flag_description_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" - ).getall()[-1] + + try: + flag_description_url = response.xpath( + "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" + ).getall()[-1] + except IndexError: + flag_description_url = response.xpath( + "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" + ).get() country_item = { **country_item, @@ -97,6 +103,7 @@ class CountrydownloaderSpider(scrapy.Spider): # yield the country item containing scraped data country_scrapy_item = WikipediaCountryScraperItem() country_scrapy_item["country_url"] = country_item["country_url"] + country_scrapy_item["flag_image_url"] = urls["flag_image_url"] country_scrapy_item["short_country_name"] = country_item["short_country_name"] country_scrapy_item["country_html"] = country_item["country_html"] country_scrapy_item["flag_html"] = country_item["flag_html"] diff --git a/playground/downloaded_data_inspection_lab/Untitled.ipynb b/playground/downloaded_data_inspection_lab/Untitled.ipynb index 1163fcc..f890fec 100644 --- a/playground/downloaded_data_inspection_lab/Untitled.ipynb +++ b/playground/downloaded_data_inspection_lab/Untitled.ipynb @@ -2,9 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "d8185790-0793-4881-99e8-6730f95a8006", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:31:57.297266Z", + "iopub.status.busy": "2022-06-24T21:31:57.284090Z", + "iopub.status.idle": "2022-06-24T21:31:57.366471Z", + "shell.execute_reply": "2022-06-24T21:31:57.365193Z", + "shell.execute_reply.started": "2022-06-24T21:31:57.293844Z" + }, "tags": [] }, "outputs": [], @@ -17,12 +24,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:31:58.025200Z", + "iopub.status.busy": "2022-06-24T21:31:58.024201Z", + "iopub.status.idle": "2022-06-24T21:31:58.108904Z", + "shell.execute_reply": "2022-06-24T21:31:58.107402Z", + "shell.execute_reply.started": "2022-06-24T21:31:58.025121Z" + }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[None]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd_options = {\n", " \"display.max_rows\": None,\n", @@ -33,9 +58,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "36149580-91d9-431d-99c3-51feee829e79", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:31:58.650508Z", + "iopub.status.busy": "2022-06-24T21:31:58.650001Z", + "iopub.status.idle": "2022-06-24T21:31:58.670264Z", + "shell.execute_reply": "2022-06-24T21:31:58.669296Z", + "shell.execute_reply.started": "2022-06-24T21:31:58.650473Z" + }, "tags": [] }, "outputs": [], @@ -47,9 +79,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "id": "d03be94e-8642-4916-8a43-1711e0c21b36", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:32:00.473759Z", + "iopub.status.busy": "2022-06-24T21:32:00.473129Z", + "iopub.status.idle": "2022-06-24T21:32:00.812851Z", + "shell.execute_reply": "2022-06-24T21:32:00.812131Z", + "shell.execute_reply.started": "2022-06-24T21:32:00.473730Z" + }, "tags": [] }, "outputs": [], @@ -61,9 +100,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:32:01.728669Z", + "iopub.status.busy": "2022-06-24T21:32:01.728021Z", + "iopub.status.idle": "2022-06-24T21:32:01.919644Z", + "shell.execute_reply": "2022-06-24T21:32:01.917919Z", + "shell.execute_reply.started": "2022-06-24T21:32:01.728629Z" + }, "tags": [] }, "outputs": [], @@ -73,12 +119,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:32:04.251701Z", + "iopub.status.busy": "2022-06-24T21:32:04.250831Z", + "iopub.status.idle": "2022-06-24T21:32:04.263351Z", + "shell.execute_reply": "2022-06-24T21:32:04.262556Z", + "shell.execute_reply.started": "2022-06-24T21:32:04.251670Z" + }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(206,)\n", + "[False]\n", + "[False]\n" + ] + } + ], "source": [ "country_url = df[\"country_url\"]\n", "print(country_url.shape)\n", @@ -88,12 +151,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "48db8f93-659b-45a4-8477-a7cec139bebc", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:32:05.191313Z", + "iopub.status.busy": "2022-06-24T21:32:05.190427Z", + "iopub.status.idle": "2022-06-24T21:32:05.233768Z", + "shell.execute_reply": "2022-06-24T21:32:05.232310Z", + "shell.execute_reply.started": "2022-06-24T21:32:05.191232Z" + }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(206,)\n", + "[False]\n", + "[False]\n" + ] + } + ], "source": [ "short_country_name = df[\"short_country_name\"]\n", "print(short_country_name.shape)\n", @@ -103,43 +183,148 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:36:11.684046Z", + "iopub.status.busy": "2022-06-24T21:36:11.683658Z", + "iopub.status.idle": "2022-06-24T21:36:11.690174Z", + "shell.execute_reply": "2022-06-24T21:36:11.689279Z", + "shell.execute_reply.started": "2022-06-24T21:36:11.684015Z" + }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(206,)\n", + "[False True]\n", + "[False True]\n" + ] + } + ], "source": [ - "flag_description = df[\"flag_description\"]\n", - "print(flag_description.shape)\n", - "print(flag_description.isnull().unique())\n", - "print(flag_description.isna().unique())" + "flag_html = df[\"flag_html\"]\n", + "print(flag_html.shape)\n", + "print(flag_html.isnull().unique())\n", + "print(flag_html.isna().unique())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "metadata": { + "execution": { + "iopub.execute_input": "2022-06-24T21:36:13.608192Z", + "iopub.status.busy": "2022-06-24T21:36:13.607778Z", + "iopub.status.idle": "2022-06-24T21:36:13.623089Z", + "shell.execute_reply": "2022-06-24T21:36:13.622311Z", + "shell.execute_reply.started": "2022-06-24T21:36:13.608162Z" + }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_urlshort_country_namecountry_htmlflag_htmlfile_urlsfiles
84https://en.wikipedia.org/wiki/ParaguayParaguay[<tr><th colspan=\"2\" class=\"infobox-above adr\"...None[https:////upload.wikimedia.org/wikipedia/comm...[{'url': 'https://upload.wikimedia.org/wikiped...
\n", + "
" + ], + "text/plain": [ + " country_url short_country_name \\\n", + "84 https://en.wikipedia.org/wiki/Paraguay Paraguay \n", + "\n", + " country_html flag_html \\\n", + "84 [