Strip numeric citation markers from flag_description.

FlagsItemLoader.flag_description_in gains a second input processor that runs
after remove_tags and deletes every "[<digits>]" marker (for example "[1]")
with re.sub. The module now imports re to support it.

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed onto a single line. Line boundaries were rebuilt so that the hunk
matches its header (12 pre-image lines, 15 post-image lines). The position of
the blank context lines is inferred from those counts; confirm against blob
5a8198d before applying.

diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py
index 5a8198d..06f2489 100644
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/itemloaders.py
@@ -1,12 +1,15 @@
+import re
+
 from itemloaders.processors import TakeFirst, MapCompose
 from scrapy.loader import ItemLoader
 from w3lib.html import remove_tags
 
+
 class FlagsItemLoader(ItemLoader):
     default_output_processor = TakeFirst()
 
     # country_name - convert "_" to " "
     country_name_in = MapCompose(lambda country: country.replace("_", " "))
 
-    # flag_description - remove html tags
-    flag_description_in = MapCompose(remove_tags)
+    # flag_description - remove html tags, remove citations [1]
+    flag_description_in = MapCompose(remove_tags, lambda line: re.sub(r"(\[\d+\])", "", line))