chore: add flag image url to saved item

This commit is contained in:
2022-06-24 22:43:15 +01:00
parent 17b0462da5
commit fa26c99ba5
3 changed files with 220 additions and 27 deletions

View File

@@ -8,6 +8,7 @@ import scrapy
class WikipediaCountryScraperItem(scrapy.Item): class WikipediaCountryScraperItem(scrapy.Item):
country_url = scrapy.Field() country_url = scrapy.Field()
flag_image_url = scrapy.Field()
short_country_name = scrapy.Field() short_country_name = scrapy.Field()
country_html = scrapy.Field() country_html = scrapy.Field()
flag_html = scrapy.Field() flag_html = scrapy.Field()

View File

@@ -49,9 +49,15 @@ class CountrydownloaderSpider(scrapy.Spider):
flag_image_url = response.xpath( flag_image_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
).get() ).get()
flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" try:
).getall()[-1] flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href"
).getall()[-1]
except IndexError:
flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href"
).get()
country_item = { country_item = {
**country_item, **country_item,
@@ -97,6 +103,7 @@ class CountrydownloaderSpider(scrapy.Spider):
# yield the country item containing scraped data # yield the country item containing scraped data
country_scrapy_item = WikipediaCountryScraperItem() country_scrapy_item = WikipediaCountryScraperItem()
country_scrapy_item["country_url"] = country_item["country_url"] country_scrapy_item["country_url"] = country_item["country_url"]
country_scrapy_item["flag_image_url"] = urls["flag_image_url"]
country_scrapy_item["short_country_name"] = country_item["short_country_name"] country_scrapy_item["short_country_name"] = country_item["short_country_name"]
country_scrapy_item["country_html"] = country_item["country_html"] country_scrapy_item["country_html"] = country_item["country_html"]
country_scrapy_item["flag_html"] = country_item["flag_html"] country_scrapy_item["flag_html"] = country_item["flag_html"]

View File

@@ -2,9 +2,16 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 38,
"id": "d8185790-0793-4881-99e8-6730f95a8006", "id": "d8185790-0793-4881-99e8-6730f95a8006",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:31:57.297266Z",
"iopub.status.busy": "2022-06-24T21:31:57.284090Z",
"iopub.status.idle": "2022-06-24T21:31:57.366471Z",
"shell.execute_reply": "2022-06-24T21:31:57.365193Z",
"shell.execute_reply.started": "2022-06-24T21:31:57.293844Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
@@ -17,12 +24,30 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 39,
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9", "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:31:58.025200Z",
"iopub.status.busy": "2022-06-24T21:31:58.024201Z",
"iopub.status.idle": "2022-06-24T21:31:58.108904Z",
"shell.execute_reply": "2022-06-24T21:31:58.107402Z",
"shell.execute_reply.started": "2022-06-24T21:31:58.025121Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[None]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"pd_options = {\n", "pd_options = {\n",
" \"display.max_rows\": None,\n", " \"display.max_rows\": None,\n",
@@ -33,9 +58,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 40,
"id": "36149580-91d9-431d-99c3-51feee829e79", "id": "36149580-91d9-431d-99c3-51feee829e79",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:31:58.650508Z",
"iopub.status.busy": "2022-06-24T21:31:58.650001Z",
"iopub.status.idle": "2022-06-24T21:31:58.670264Z",
"shell.execute_reply": "2022-06-24T21:31:58.669296Z",
"shell.execute_reply.started": "2022-06-24T21:31:58.650473Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
@@ -47,9 +79,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 41,
"id": "d03be94e-8642-4916-8a43-1711e0c21b36", "id": "d03be94e-8642-4916-8a43-1711e0c21b36",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:32:00.473759Z",
"iopub.status.busy": "2022-06-24T21:32:00.473129Z",
"iopub.status.idle": "2022-06-24T21:32:00.812851Z",
"shell.execute_reply": "2022-06-24T21:32:00.812131Z",
"shell.execute_reply.started": "2022-06-24T21:32:00.473730Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
@@ -61,9 +100,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 42,
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:32:01.728669Z",
"iopub.status.busy": "2022-06-24T21:32:01.728021Z",
"iopub.status.idle": "2022-06-24T21:32:01.919644Z",
"shell.execute_reply": "2022-06-24T21:32:01.917919Z",
"shell.execute_reply.started": "2022-06-24T21:32:01.728629Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
@@ -73,12 +119,29 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 43,
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:32:04.251701Z",
"iopub.status.busy": "2022-06-24T21:32:04.250831Z",
"iopub.status.idle": "2022-06-24T21:32:04.263351Z",
"shell.execute_reply": "2022-06-24T21:32:04.262556Z",
"shell.execute_reply.started": "2022-06-24T21:32:04.251670Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [ "source": [
"country_url = df[\"country_url\"]\n", "country_url = df[\"country_url\"]\n",
"print(country_url.shape)\n", "print(country_url.shape)\n",
@@ -88,12 +151,29 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 44,
"id": "48db8f93-659b-45a4-8477-a7cec139bebc", "id": "48db8f93-659b-45a4-8477-a7cec139bebc",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:32:05.191313Z",
"iopub.status.busy": "2022-06-24T21:32:05.190427Z",
"iopub.status.idle": "2022-06-24T21:32:05.233768Z",
"shell.execute_reply": "2022-06-24T21:32:05.232310Z",
"shell.execute_reply.started": "2022-06-24T21:32:05.191232Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False]\n",
"[False]\n"
]
}
],
"source": [ "source": [
"short_country_name = df[\"short_country_name\"]\n", "short_country_name = df[\"short_country_name\"]\n",
"print(short_country_name.shape)\n", "print(short_country_name.shape)\n",
@@ -103,43 +183,148 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 58,
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:36:11.684046Z",
"iopub.status.busy": "2022-06-24T21:36:11.683658Z",
"iopub.status.idle": "2022-06-24T21:36:11.690174Z",
"shell.execute_reply": "2022-06-24T21:36:11.689279Z",
"shell.execute_reply.started": "2022-06-24T21:36:11.684015Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(206,)\n",
"[False True]\n",
"[False True]\n"
]
}
],
"source": [ "source": [
"flag_description = df[\"flag_description\"]\n", "flag_html = df[\"flag_html\"]\n",
"print(flag_description.shape)\n", "print(flag_html.shape)\n",
"print(flag_description.isnull().unique())\n", "print(flag_html.isnull().unique())\n",
"print(flag_description.isna().unique())" "print(flag_html.isna().unique())"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 59,
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:36:13.608192Z",
"iopub.status.busy": "2022-06-24T21:36:13.607778Z",
"iopub.status.idle": "2022-06-24T21:36:13.623089Z",
"shell.execute_reply": "2022-06-24T21:36:13.622311Z",
"shell.execute_reply.started": "2022-06-24T21:36:13.608162Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country_url</th>\n",
" <th>short_country_name</th>\n",
" <th>country_html</th>\n",
" <th>flag_html</th>\n",
" <th>file_urls</th>\n",
" <th>files</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>https://en.wikipedia.org/wiki/Paraguay</td>\n",
" <td>Paraguay</td>\n",
" <td>[&lt;tr&gt;&lt;th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
" <td>None</td>\n",
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country_url short_country_name \\\n",
"84 https://en.wikipedia.org/wiki/Paraguay Paraguay \n",
"\n",
" country_html flag_html \\\n",
"84 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... None \n",
"\n",
" file_urls \\\n",
"84 [https:////upload.wikimedia.org/wikipedia/comm... \n",
"\n",
" files \n",
"84 [{'url': 'https://upload.wikimedia.org/wikiped... "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"no_anthem = [item for item in [df[\"files\"]][0] if len(item) == 1]\n", "df[df[\"flag_html\"].isnull()]"
"print(len(no_anthem))\n",
"no_anthem"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 64,
"id": "227b0c76-9e45-4849-849e-36355976cba9", "id": "227b0c76-9e45-4849-849e-36355976cba9",
"metadata": { "metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:39:06.360439Z",
"iopub.status.busy": "2022-06-24T21:39:06.360082Z",
"iopub.status.idle": "2022-06-24T21:39:06.366232Z",
"shell.execute_reply": "2022-06-24T21:39:06.365364Z",
"shell.execute_reply.started": "2022-06-24T21:39:06.360409Z"
},
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[{'url': 'https://upload.wikimedia.org/wikipedia/commons/2/27/Flag_of_Paraguay.svg',\n",
" 'path': 'files/flags/Flag_of_Paraguay.svg',\n",
" 'checksum': '8156f5f7586b7d92c0fb46aeda70ce68',\n",
" 'status': 'downloaded'}]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"df[\"short_country_name\"]" "df.iloc[84][\"files\"]"
] ]
}, },
{ {