chore: try alternative flag spider

This commit is contained in:
2022-06-26 17:46:04 +01:00
parent b5eec4550d
commit 44a8365c53
2 changed files with 4496 additions and 19 deletions

View File

@@ -5,6 +5,9 @@ from scrapy.http import TextResponse
from wikipedia_country_scraper.items import FlagsItem
EXCLUDED = ["Réunion"]
class FlagsSpider(scrapy.Spider):
name = "FlagsSpider"
start_urls = [
@@ -30,46 +33,55 @@ class FlagsSpider(scrapy.Spider):
]
for country_name, country_url in zip(country_names, country_urls):
yield scrapy.Request(
url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name}
)
if country_name not in EXCLUDED:
yield scrapy.Request(
url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name}
)
def get_country_page(self, response: TextResponse, country_name: str):
    """Locate the flag links in a country page's infobox and request the flag description page.

    The infobox is searched first for links nested three ``div`` levels below a
    table cell. When that yields fewer than two links, two shallower paths are
    tried instead: one for the flag image link and one for the flag description
    link.

    The flag image page is not requested here. Its absolute URL is forwarded
    under ``urls["flag_image_url"]`` so that ``get_flag_description`` can request
    it once the description has been extracted.

    Args:
        response: The downloaded country page.
        country_name: Name of the country the page belongs to; passed through
            to the next callback unchanged.

    Yields:
        A single ``scrapy.Request`` for the flag description page, or nothing
        when no usable links are found in the infobox.
    """
    flag_data = response.xpath(
        "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div/div/a[not(contains(@href, 'cite_note'))]/@href"
    ).getall()

    if len(flag_data) >= 2:
        flag_image_url = flag_data[0]
        flag_description_url = flag_data[1]
    else:
        flag_image_data = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div/a[not(contains(@href, 'cite_note'))]/@href"
        ).getall()
        flag_description_data = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td/div/a[not(contains(@href, 'cite_note'))]/@href"
        ).getall()

        # Guard the fallback too: indexing an empty result here used to raise an
        # uncaught IndexError inside the callback.
        if not flag_image_data or len(flag_description_data) < 2:
            self.logger.warning(
                "No flag links found in infobox for %s (%s); skipping", country_name, response.url
            )
            return

        flag_image_url = flag_image_data[0]
        # NOTE(review): index 1 is kept from the original code; presumably the
        # first match of this shallower path is not the description link - confirm.
        flag_description_url = flag_description_data[1]

    yield scrapy.Request(
        url=f"https://en.wikipedia.org{flag_description_url}",
        callback=self.get_flag_description,
        cb_kwargs={
            "country_name": country_name,
            "urls": {"flag_image_url": f"https://en.wikipedia.org{flag_image_url}"},
        },
    )
def get_flag_description(self, response: TextResponse, country_name: str, urls: dict):
    """Extract the flag description paragraph, then request the flag image page.

    Args:
        response: The downloaded flag description page.
        country_name: Name of the country; forwarded to ``get_flag_image``.
        urls: Mapping that must contain ``"flag_image_url"``, the URL of the
            flag image page to request next.

    Yields:
        A ``scrapy.Request`` for the flag image page, carrying the country name
        and the extracted description HTML (``None`` when no paragraph matched)
        as callback keyword arguments.
    """
    # First non-empty paragraph of the article body.
    description_xpath = (
        "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
    )
    description_html = response.xpath(description_xpath).get()

    forwarded_kwargs = {"country_name": country_name, "flag_description_html": description_html}
    yield scrapy.Request(
        url=urls["flag_image_url"],
        callback=self.get_flag_image,
        cb_kwargs=forwarded_kwargs,
    )
def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str):
    """Build the final ``FlagsItem`` from the flag image page.

    Args:
        response: The downloaded flag image (file) page.
        country_name: Name of the country the flag belongs to.
        flag_description_html: Description HTML extracted by
            ``get_flag_description``; stored on the item unchanged.

    Yields:
        One ``FlagsItem`` with ``country_name``, ``flag_description_html`` and
        ``file_urls``. ``file_urls`` holds the absolute URL of the full-size
        image, or is empty when the page has no full-image link.
    """
    flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()

    flags_item = FlagsItem()
    flags_item["country_name"] = country_name
    flags_item["flag_description_html"] = flag_description_html

    if flag_image_result is None:
        # Previously this produced the bogus URL "https:None"; keep the item
        # (the description is still useful) but give it no file to download.
        self.logger.warning("No full-size flag image link for %s (%s)", country_name, response.url)
        flags_item["file_urls"] = []
    else:
        # urljoin resolves protocol-relative ("//host/...") hrefs against the
        # page URL and leaves already-absolute hrefs intact.
        flags_item["file_urls"] = [response.urljoin(flag_image_result)]

    yield flags_item

File diff suppressed because it is too large Load Diff