chore: change xml for flag description url

This commit is contained in:
2022-06-24 22:59:13 +01:00
parent fa26c99ba5
commit e865018fd9
3 changed files with 182 additions and 62 deletions

View File

@@ -8,7 +8,7 @@ import scrapy
class WikipediaCountryScraperItem(scrapy.Item): class WikipediaCountryScraperItem(scrapy.Item):
country_url = scrapy.Field() country_url = scrapy.Field()
flag_image_url = scrapy.Field() flag_description_url = scrapy.Field()
short_country_name = scrapy.Field() short_country_name = scrapy.Field()
country_html = scrapy.Field() country_html = scrapy.Field()
flag_html = scrapy.Field() flag_html = scrapy.Field()

View File

@@ -50,14 +50,9 @@ class CountrydownloaderSpider(scrapy.Spider):
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
).get() ).get()
try:
flag_description_url = response.xpath( flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href" "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a[not(contains(@href, 'cite_note'))]/@href"
).getall()[-1] ).getall()[-1]
except IndexError:
flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href"
).get()
country_item = { country_item = {
**country_item, **country_item,
@@ -71,6 +66,7 @@ class CountrydownloaderSpider(scrapy.Spider):
"country_item": country_item, "country_item": country_item,
"urls": { "urls": {
"flag_image_url": f"https://en.wikipedia.org{flag_image_url}", "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
"flag_description_url": f"https://en.wikipedia.org{flag_description_url}",
}, },
}, },
) )
@@ -103,7 +99,7 @@ class CountrydownloaderSpider(scrapy.Spider):
# yield the country item containing scraped data # yield the country item containing scraped data
country_scrapy_item = WikipediaCountryScraperItem() country_scrapy_item = WikipediaCountryScraperItem()
country_scrapy_item["country_url"] = country_item["country_url"] country_scrapy_item["country_url"] = country_item["country_url"]
country_scrapy_item["flag_image_url"] = urls["flag_image_url"] country_scrapy_item["flag_description_url"] = urls["flag_description_url"]
country_scrapy_item["short_country_name"] = country_item["short_country_name"] country_scrapy_item["short_country_name"] = country_item["short_country_name"]
country_scrapy_item["country_html"] = country_item["country_html"] country_scrapy_item["country_html"] = country_item["country_html"]
country_scrapy_item["flag_html"] = country_item["flag_html"] country_scrapy_item["flag_html"] = country_item["flag_html"]

View File

@@ -79,15 +79,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 41, "execution_count": 74,
"id": "d03be94e-8642-4916-8a43-1711e0c21b36", "id": "d03be94e-8642-4916-8a43-1711e0c21b36",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:32:00.473759Z", "iopub.execute_input": "2022-06-24T21:52:35.726961Z",
"iopub.status.busy": "2022-06-24T21:32:00.473129Z", "iopub.status.busy": "2022-06-24T21:52:35.726356Z",
"iopub.status.idle": "2022-06-24T21:32:00.812851Z", "iopub.status.idle": "2022-06-24T21:52:35.823456Z",
"shell.execute_reply": "2022-06-24T21:32:00.812131Z", "shell.execute_reply": "2022-06-24T21:52:35.822464Z",
"shell.execute_reply.started": "2022-06-24T21:32:00.473730Z" "shell.execute_reply.started": "2022-06-24T21:52:35.726932Z"
}, },
"tags": [] "tags": []
}, },
@@ -100,15 +100,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 42, "execution_count": 75,
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd", "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:32:01.728669Z", "iopub.execute_input": "2022-06-24T21:52:36.453655Z",
"iopub.status.busy": "2022-06-24T21:32:01.728021Z", "iopub.status.busy": "2022-06-24T21:52:36.452202Z",
"iopub.status.idle": "2022-06-24T21:32:01.919644Z", "iopub.status.idle": "2022-06-24T21:52:36.555461Z",
"shell.execute_reply": "2022-06-24T21:32:01.917919Z", "shell.execute_reply": "2022-06-24T21:52:36.554096Z",
"shell.execute_reply.started": "2022-06-24T21:32:01.728629Z" "shell.execute_reply.started": "2022-06-24T21:52:36.453559Z"
}, },
"tags": [] "tags": []
}, },
@@ -119,15 +119,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": 76,
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba", "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:32:04.251701Z", "iopub.execute_input": "2022-06-24T21:52:36.950435Z",
"iopub.status.busy": "2022-06-24T21:32:04.250831Z", "iopub.status.busy": "2022-06-24T21:52:36.949946Z",
"iopub.status.idle": "2022-06-24T21:32:04.263351Z", "iopub.status.idle": "2022-06-24T21:52:36.959935Z",
"shell.execute_reply": "2022-06-24T21:32:04.262556Z", "shell.execute_reply": "2022-06-24T21:52:36.958581Z",
"shell.execute_reply.started": "2022-06-24T21:32:04.251670Z" "shell.execute_reply.started": "2022-06-24T21:52:36.950398Z"
}, },
"tags": [] "tags": []
}, },
@@ -151,15 +151,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": 77,
"id": "48db8f93-659b-45a4-8477-a7cec139bebc", "id": "48db8f93-659b-45a4-8477-a7cec139bebc",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:32:05.191313Z", "iopub.execute_input": "2022-06-24T21:52:37.635010Z",
"iopub.status.busy": "2022-06-24T21:32:05.190427Z", "iopub.status.busy": "2022-06-24T21:52:37.634417Z",
"iopub.status.idle": "2022-06-24T21:32:05.233768Z", "iopub.status.idle": "2022-06-24T21:52:37.645162Z",
"shell.execute_reply": "2022-06-24T21:32:05.232310Z", "shell.execute_reply": "2022-06-24T21:52:37.643796Z",
"shell.execute_reply.started": "2022-06-24T21:32:05.191232Z" "shell.execute_reply.started": "2022-06-24T21:52:37.634953Z"
}, },
"tags": [] "tags": []
}, },
@@ -183,15 +183,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 58, "execution_count": 81,
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e", "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:36:11.684046Z", "iopub.execute_input": "2022-06-24T21:52:49.070882Z",
"iopub.status.busy": "2022-06-24T21:36:11.683658Z", "iopub.status.busy": "2022-06-24T21:52:49.070107Z",
"iopub.status.idle": "2022-06-24T21:36:11.690174Z", "iopub.status.idle": "2022-06-24T21:52:49.076033Z",
"shell.execute_reply": "2022-06-24T21:36:11.689279Z", "shell.execute_reply": "2022-06-24T21:52:49.075243Z",
"shell.execute_reply.started": "2022-06-24T21:36:11.684015Z" "shell.execute_reply.started": "2022-06-24T21:52:49.070853Z"
}, },
"tags": [] "tags": []
}, },
@@ -215,15 +215,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 59, "execution_count": 84,
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b", "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:36:13.608192Z", "iopub.execute_input": "2022-06-24T21:53:15.797108Z",
"iopub.status.busy": "2022-06-24T21:36:13.607778Z", "iopub.status.busy": "2022-06-24T21:53:15.796761Z",
"iopub.status.idle": "2022-06-24T21:36:13.623089Z", "iopub.status.idle": "2022-06-24T21:53:15.809904Z",
"shell.execute_reply": "2022-06-24T21:36:13.622311Z", "shell.execute_reply": "2022-06-24T21:53:15.809157Z",
"shell.execute_reply.started": "2022-06-24T21:36:13.608162Z" "shell.execute_reply.started": "2022-06-24T21:53:15.797079Z"
}, },
"tags": [] "tags": []
}, },
@@ -250,6 +250,7 @@
" <tr style=\"text-align: right;\">\n", " <tr style=\"text-align: right;\">\n",
" <th></th>\n", " <th></th>\n",
" <th>country_url</th>\n", " <th>country_url</th>\n",
" <th>flag_image_url</th>\n",
" <th>short_country_name</th>\n", " <th>short_country_name</th>\n",
" <th>country_html</th>\n", " <th>country_html</th>\n",
" <th>flag_html</th>\n", " <th>flag_html</th>\n",
@@ -261,6 +262,7 @@
" <tr>\n", " <tr>\n",
" <th>84</th>\n", " <th>84</th>\n",
" <td>https://en.wikipedia.org/wiki/Paraguay</td>\n", " <td>https://en.wikipedia.org/wiki/Paraguay</td>\n",
" <td>https://en.wikipedia.org/wiki/File:Flag_of_Par...</td>\n",
" <td>Paraguay</td>\n", " <td>Paraguay</td>\n",
" <td>[&lt;tr&gt;&lt;th colspan=\"2\" class=\"infobox-above adr\"...</td>\n", " <td>[&lt;tr&gt;&lt;th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
" <td>None</td>\n", " <td>None</td>\n",
@@ -272,8 +274,11 @@
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" country_url short_country_name \\\n", " country_url \\\n",
"84 https://en.wikipedia.org/wiki/Paraguay Paraguay \n", "84 https://en.wikipedia.org/wiki/Paraguay \n",
"\n",
" flag_image_url short_country_name \\\n",
"84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n",
"\n", "\n",
" country_html flag_html \\\n", " country_html flag_html \\\n",
"84 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... None \n", "84 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... None \n",
@@ -285,7 +290,7 @@
"84 [{'url': 'https://upload.wikimedia.org/wikiped... " "84 [{'url': 'https://upload.wikimedia.org/wikiped... "
] ]
}, },
"execution_count": 59, "execution_count": 84,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -296,15 +301,100 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 64, "execution_count": 85,
"id": "5e21e98a-56ba-4e55-b5d4-89dab2232c29",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:53:31.685434Z",
"iopub.status.busy": "2022-06-24T21:53:31.684830Z",
"iopub.status.idle": "2022-06-24T21:53:31.697841Z",
"shell.execute_reply": "2022-06-24T21:53:31.697000Z",
"shell.execute_reply.started": "2022-06-24T21:53:31.685404Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country_url</th>\n",
" <th>flag_image_url</th>\n",
" <th>short_country_name</th>\n",
" <th>country_html</th>\n",
" <th>flag_html</th>\n",
" <th>file_urls</th>\n",
" <th>files</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>https://en.wikipedia.org/wiki/Paraguay</td>\n",
" <td>https://en.wikipedia.org/wiki/File:Flag_of_Par...</td>\n",
" <td>Paraguay</td>\n",
" <td>[&lt;tr&gt;&lt;th colspan=\"2\" class=\"infobox-above adr\"...</td>\n",
" <td>None</td>\n",
" <td>[https:////upload.wikimedia.org/wikipedia/comm...</td>\n",
" <td>[{'url': 'https://upload.wikimedia.org/wikiped...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country_url \\\n",
"84 https://en.wikipedia.org/wiki/Paraguay \n",
"\n",
" flag_image_url short_country_name \\\n",
"84 https://en.wikipedia.org/wiki/File:Flag_of_Par... Paraguay \n",
"\n",
" country_html flag_html \\\n",
"84 [<tr><th colspan=\"2\" class=\"infobox-above adr\"... None \n",
"\n",
" file_urls \\\n",
"84 [https:////upload.wikimedia.org/wikipedia/comm... \n",
"\n",
" files \n",
"84 [{'url': 'https://upload.wikimedia.org/wikiped... "
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df[\"flag_html\"].isna()]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "227b0c76-9e45-4849-849e-36355976cba9", "id": "227b0c76-9e45-4849-849e-36355976cba9",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2022-06-24T21:39:06.360439Z", "iopub.execute_input": "2022-06-24T21:54:13.588753Z",
"iopub.status.busy": "2022-06-24T21:39:06.360082Z", "iopub.status.busy": "2022-06-24T21:54:13.588402Z",
"iopub.status.idle": "2022-06-24T21:39:06.366232Z", "iopub.status.idle": "2022-06-24T21:54:13.594182Z",
"shell.execute_reply": "2022-06-24T21:39:06.365364Z", "shell.execute_reply": "2022-06-24T21:54:13.593418Z",
"shell.execute_reply.started": "2022-06-24T21:39:06.360409Z" "shell.execute_reply.started": "2022-06-24T21:54:13.588723Z"
}, },
"tags": [] "tags": []
}, },
@@ -312,25 +402,59 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[{'url': 'https://upload.wikimedia.org/wikipedia/commons/2/27/Flag_of_Paraguay.svg',\n", "'https://en.wikipedia.org/wiki/File:Flag_of_Paraguay.svg'"
" 'path': 'files/flags/Flag_of_Paraguay.svg',\n",
" 'checksum': '8156f5f7586b7d92c0fb46aeda70ce68',\n",
" 'status': 'downloaded'}]"
] ]
}, },
"execution_count": 64, "execution_count": 89,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"df.iloc[84][\"files\"]" "df.iloc[84][\"flag_image_url\"]"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20",
"metadata": {
"execution": {
"iopub.execute_input": "2022-06-24T21:53:55.941453Z",
"iopub.status.busy": "2022-06-24T21:53:55.940811Z",
"iopub.status.idle": "2022-06-24T21:53:55.947972Z",
"shell.execute_reply": "2022-06-24T21:53:55.947077Z",
"shell.execute_reply.started": "2022-06-24T21:53:55.941423Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"206\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(len([file for file in df.files if len(file) != 0]))\n",
"[file for file in df.files if len(file) == 0]"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20", "id": "d7e60156-1ee5-4bf9-ab9a-d529ee988301",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []