chore: change XPath for flag description URL

This commit is contained in:
2022-06-24 22:59:13 +01:00
parent fa26c99ba5
commit e865018fd9
3 changed files with 182 additions and 62 deletions

View File

@@ -8,7 +8,7 @@ import scrapy
class WikipediaCountryScraperItem(scrapy.Item):
country_url = scrapy.Field()
flag_image_url = scrapy.Field()
flag_description_url = scrapy.Field()
short_country_name = scrapy.Field()
country_html = scrapy.Field()
flag_html = scrapy.Field()

View File

@@ -50,14 +50,9 @@ class CountrydownloaderSpider(scrapy.Spider):
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
).get()
try:
flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href"
).getall()[-1]
except IndexError:
flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a/@href"
).get()
flag_description_url = response.xpath(
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div/a[not(contains(@href, 'cite_note'))]/@href"
).getall()[-1]
country_item = {
**country_item,
@@ -71,6 +66,7 @@ class CountrydownloaderSpider(scrapy.Spider):
"country_item": country_item,
"urls": {
"flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
"flag_description_url": f"https://en.wikipedia.org{flag_description_url}",
},
},
)
@@ -103,7 +99,7 @@ class CountrydownloaderSpider(scrapy.Spider):
# yield the country item containing scraped data
country_scrapy_item = WikipediaCountryScraperItem()
country_scrapy_item["country_url"] = country_item["country_url"]
country_scrapy_item["flag_image_url"] = urls["flag_image_url"]
country_scrapy_item["flag_description_url"] = urls["flag_description_url"]
country_scrapy_item["short_country_name"] = country_item["short_country_name"]
country_scrapy_item["country_html"] = country_item["country_html"]
country_scrapy_item["flag_html"] = country_item["flag_html"]