chore: add citation removal to item loader

This commit is contained in:
2022-06-26 22:26:49 +01:00
parent 6fcae89c5d
commit 55e4ff7722

View File

@@ -1,12 +1,15 @@
import re
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
from w3lib.html import remove_tags
class FlagsItemLoader(ItemLoader):
    """Item loader that cleans up scraped flag fields.

    The default output processor is ``TakeFirst()``; the ``*_in`` input
    processors below normalise the raw strings collected for each field.
    """

    default_output_processor = TakeFirst()

    # country_name - convert "_" to " "
    country_name_in = MapCompose(lambda country: country.replace("_", " "))

    # flag_description - remove html tags, then remove numeric citation
    # markers such as "[1]". Defined once only: an earlier tags-only
    # assignment of this attribute would be silently overwritten.
    flag_description_in = MapCompose(
        remove_tags,
        lambda line: re.sub(r"(\[\d+\])", "", line),
    )