chore: add citation removal to item loader
This commit is contained in:
@@ -1,12 +1,15 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
from itemloaders.processors import TakeFirst, MapCompose
|
from itemloaders.processors import TakeFirst, MapCompose
|
||||||
from scrapy.loader import ItemLoader
|
from scrapy.loader import ItemLoader
|
||||||
from w3lib.html import remove_tags
|
from w3lib.html import remove_tags
|
||||||
|
|
||||||
|
|
||||||
class FlagsItemLoader(ItemLoader):
|
class FlagsItemLoader(ItemLoader):
|
||||||
default_output_processor = TakeFirst()
|
default_output_processor = TakeFirst()
|
||||||
|
|
||||||
# country_name - convert "_" to " "
|
# country_name - convert "_" to " "
|
||||||
country_name_in = MapCompose(lambda country: country.replace("_", " "))
|
country_name_in = MapCompose(lambda country: country.replace("_", " "))
|
||||||
|
|
||||||
# flag_description - remove html tags
|
# flag_description - remove html tags, then strip citation markers such as "[1]"
|
||||||
flag_description_in = MapCompose(remove_tags)
|
flag_description_in = MapCompose(remove_tags, lambda line: re.sub(r"(\[\d+\])", "", line))
|
||||||
|
|||||||
Reference in New Issue
Block a user