From 5cc9297a01ad2d28419ad8a91f084cffbb023afe Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 Jun 2022 23:57:07 +0100 Subject: [PATCH] chore: update filter middleware --- .../wikipedia_country_scraper/middlewares.py | 6 ++++++ .../wikipedia_country_scraper/pipelines.py | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/middlewares.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/middlewares.py index c962064..5ae9a46 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/middlewares.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/middlewares.py @@ -105,5 +105,11 @@ class WikipediaCountryScraperDownloaderMiddleware: class WikipediaCountryScraperDecodeURLMiddleware: def process_request(self, request, spider): + # https://www.w3schools.com/tags/ref_urlencode.ASP + request._url = request.url.replace("%27", "'") request._url = request.url.replace("%28", "(") request._url = request.url.replace("%29", ")") + request._url = request.url.replace("%C3%A7", "ç") + request._url = request.url.replace("%C3%B1", "ñ") + request._url = request.url.replace("%C3%B4", "ô") + request._url = request.url.replace("%C3%85", "Å") diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py index 9057f6a..e306cd0 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py @@ -3,7 +3,6 @@ # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html -import urllib import re # useful for handling different item types with a single interface