From 55e4ff77228fd5c16f29ad4f9af2c72d49572e63 Mon Sep 17 00:00:00 2001
From: Daniel Tomlinson
Date: Sun, 26 Jun 2022 22:26:49 +0100
Subject: [PATCH] chore: add remove citations to item loader

---
Notes (reviewer commentary; not part of the commit message):
  * Adds `import re` at the top of itemloaders.py.
  * FlagsItemLoader.flag_description_in gains a second MapCompose
    processor, listed after remove_tags, that deletes bracketed numeric
    markers such as "[1]" by substituting the pattern \[\d+\] with "".
  * Adds one blank line in front of the FlagsItemLoader class statement.
  * NOTE(review): this patch was recovered from a whitespace-collapsed copy.
    The position of the blank context lines below was reconstructed from the
    hunk header counts (-1,12 +1,15) - verify against blob 5a8198d before
    applying.

 .../wikipedia_country_scraper/itemloaders.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py
index 5a8198d..06f2489 100644
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py
@@ -1,12 +1,15 @@
+import re
+
 from itemloaders.processors import TakeFirst, MapCompose
 from scrapy.loader import ItemLoader
 from w3lib.html import remove_tags
 
+
 class FlagsItemLoader(ItemLoader):
     default_output_processor = TakeFirst()
 
     # country_name - convert "_" to " "
     country_name_in = MapCompose(lambda country: country.replace("_", " "))
 
-    # flag_description - remove html tags
-    flag_description_in = MapCompose(remove_tags)
+    # flag_description - remove html tags, remove citations [1]
+    flag_description_in = MapCompose(remove_tags, lambda line: re.sub(r"(\[\d+\])", "", line))