chore: try alternative flag spider
@@ -5,6 +5,9 @@ from scrapy.http import TextResponse
 from wikipedia_country_scraper.items import FlagsItem
 
 
+EXCLUDED = ["Réunion"]
+
+
 class FlagsSpider(scrapy.Spider):
     name = "FlagsSpider"
     start_urls = [
@@ -30,46 +33,55 @@ class FlagsSpider(scrapy.Spider):
         ]
 
         for country_name, country_url in zip(country_names, country_urls):
-            yield scrapy.Request(
-                url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name}
-            )
+            if country_name not in EXCLUDED:
+                yield scrapy.Request(
+                    url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name}
+                )
 
     def get_country_page(self, response: TextResponse, country_name: str):
         flag_data = response.xpath(
             "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div/div/a[not(contains(@href, 'cite_note'))]/@href"
         ).getall()
 
-        flag_image_url = flag_data[0]
-        flag_description_url = flag_data[1]
+        try:
+            flag_image_url = flag_data[0]
+            flag_description_url = flag_data[1]
+        except IndexError:
+            flag_image_data = response.xpath(
+                "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div/a[not(contains(@href, 'cite_note'))]/@href"
+            ).getall()
+            flag_description_data = response.xpath(
+                "//table[contains(@class, 'infobox')]/tbody/tr/td/div/a[not(contains(@href, 'cite_note'))]/@href"
+            ).getall()
+            flag_image_url = flag_image_data[0]
+            flag_description_url = flag_description_data[1]
 
         yield scrapy.Request(
             url=f"https://en.wikipedia.org{flag_description_url}",
             callback=self.get_flag_description,
-            cb_kwargs={"country_name": country_name},
+            cb_kwargs={
+                "country_name": country_name,
+                "urls": {"flag_image_url": f"https://en.wikipedia.org{flag_image_url}"},
+            },
         )
-
-        yield scrapy.Request(
-            url=f"https://en.wikipedia.org{flag_image_url}",
-            callback=self.get_flag_image,
-            cb_kwargs={"country_name": country_name},
-        )
 
-    def get_flag_description(self, response: TextResponse, country_name: str):
+    def get_flag_description(self, response: TextResponse, country_name: str, urls: dict):
         flag_description_result = response.xpath(
             "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
         ).get()
 
-        flags_item = FlagsItem()
-        flags_item["country_name"] = country_name
-        flags_item["flag_description_html"] = flag_description_result
-
-        yield flags_item
+        yield scrapy.Request(
+            url=urls["flag_image_url"],
+            callback=self.get_flag_image,
+            cb_kwargs={"country_name": country_name, "flag_description_html": flag_description_result},
+        )
 
-    def get_flag_image(self, response: TextResponse, country_name: str):
+    def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str):
         flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
 
         flags_item = FlagsItem()
         flags_item["country_name"] = country_name
+        flags_item["flag_description_html"] = flag_description_html
         flags_item["file_urls"] = [f"https:{flag_image_result}"]
 
         yield flags_item
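The spider builds FlagsItem objects imported from wikipedia_country_scraper.items, a module this commit does not touch. A minimal sketch of what that item class could look like, assuming its fields simply mirror the keys the spider assigns (country_name, flag_description_html, file_urls) plus the files field that Scrapy's FilesPipeline populates:

import scrapy


class FlagsItem(scrapy.Item):
    # Hypothetical reconstruction of wikipedia_country_scraper/items.py; the real
    # module is not shown in this diff. Field names mirror the keys used above.
    country_name = scrapy.Field()           # e.g. "France"
    flag_description_html = scrapy.Field()  # first non-empty <p> of the flag article
    file_urls = scrapy.Field()              # input consumed by FilesPipeline
    files = scrapy.Field()                  # output written by FilesPipeline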
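Because the item carries file_urls, actually downloading the flag images depends on Scrapy's built-in FilesPipeline being enabled in the project settings, which are also outside this diff. A sketch of the relevant settings.py entries, with FILES_STORE as an assumed local path:

# Assumed excerpt of the project's settings.py; not part of this commit.
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "downloaded_flags"  # hypothetical directory for the downloaded images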
playground/downloaded_data_inspection_lab/flags.json: new file, 4465 lines added. File diff suppressed because it is too large.
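That flags.json reads like a feed export of the items yielded by this spider. The commit does not record how it was generated; one plausible invocation using Scrapy's feed exports would be:

# -O overwrites the output file (Scrapy 2.1+); older versions append with -o instead.
scrapy crawl FlagsSpider -O playground/downloaded_data_inspection_lab/flags.json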