mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 03:55:43 +00:00
adding latest beam pipeline code for dataflow with group optimisation
This commit is contained in:
@@ -85,7 +85,7 @@ class SplitColumn(beam.DoFn):
|
|||||||
yield element
|
yield element
|
||||||
|
|
||||||
|
|
||||||
class GenerateUniqueID(beam.DoFn):
|
class CreateMappingTable(beam.DoFn):
|
||||||
"""
|
"""
|
||||||
Generate a unique ID for the PCollection, either for all the columns or for the
|
Generate a unique ID for the PCollection, either for all the columns or for the
|
||||||
uniquely identifying data only.
|
uniquely identifying data only.
|
||||||
@@ -104,6 +104,49 @@ class GenerateUniqueID(beam.DoFn):
|
|||||||
yield new_element
|
yield new_element
|
||||||
|
|
||||||
|
|
||||||
|
# class CreateMappingTable(beam.DoFn):
|
||||||
|
# def process(self, element):
|
||||||
|
# unique_string = ",".join(element)
|
||||||
|
# hashed_string = hashlib.md5(unique_string.encode())
|
||||||
|
# new_element = {hashed_string.hexdigest(): list(element)}
|
||||||
|
# yield new_element
|
||||||
|
|
||||||
|
|
||||||
|
class CreateUniquePropertyID(beam.DoFn):
    """
    Re-key an element by an MD5 digest of the trailing columns of its data.

    The last item of the element is sliced from index 2 onwards, joined with
    commas and hashed; the first item of the element becomes the value of the
    emitted pair.

    NOTE(review): presumably the element is an ``(id_all_columns, row)`` pair
    and the two skipped leading columns are price and date - confirm against
    the upstream transform.
    """

    def process(self, element):
        source_id, row = element[0], element[-1]
        # Only the columns after the first two contribute to the digest.
        digest = hashlib.md5(",".join(row[2:]).encode()).hexdigest()
        yield (digest, source_id)
|
||||||
|
|
||||||
|
|
||||||
|
class DeduplicateIDs(beam.DoFn):
    """
    Drop repeated values from the last item of an element.

    Yields ``(element[0], unique_values)`` where ``unique_values`` is a list
    holding each distinct value of ``element[-1]`` exactly once, in the order
    the values were first encountered.

    NOTE(review): presumably the element is a ``(key, iterable_of_ids)`` pair
    produced by a GroupByKey - confirm against the caller.
    """

    def process(self, element):
        # dict.fromkeys keeps first-seen order, so the same input always gives
        # the same output. list(set(...)) returned the ids in an arbitrary
        # order that can change between interpreter runs (str hash
        # randomisation).
        deduplicated_list = list(dict.fromkeys(element[-1]))
        yield (element[0], deduplicated_list)
|
||||||
|
|
||||||
|
|
||||||
|
# class InsertDataForID(beam.DoFn):
|
||||||
|
# def __init__(self, mapping_table):
|
||||||
|
# self.mapping_table = mapping_table
|
||||||
|
|
||||||
|
# def process(self, element):
|
||||||
|
# replaced_list = [self.mapping_table[x] for x in element[-1]]
|
||||||
|
# new_element = (element[0], replaced_list)
|
||||||
|
# yield new_element
|
||||||
|
|
||||||
|
|
||||||
|
def insert_data_for_id(element, mapping_table):
    """
    Replace every ID in the last item of ``element`` with its mapped data.

    Args:
        element: Indexable whose first item is passed through unchanged and
            whose last item is an iterable of IDs to look up.
        mapping_table: Mapping from ID to the data that replaces it.

    Yields:
        A single ``(element[0], replaced_list)`` tuple, where ``replaced_list``
        holds ``mapping_table[data_id]`` for each ID, in iteration order.

    Raises:
        KeyError: If an ID is not present in ``mapping_table``.
    """
    replaced_list = [mapping_table[data_id] for data_id in element[-1]]
    yield (element[0], replaced_list)
|
||||||
|
|
||||||
|
|
||||||
|
# old
|
||||||
class DeduplicateByID(beam.DoFn):
|
class DeduplicateByID(beam.DoFn):
|
||||||
"""
|
"""
|
||||||
If the PCollection has multiple entries after being grouped by ID for all columns,
|
If the PCollection has multiple entries after being grouped by ID for all columns,
|
||||||
@@ -280,37 +323,66 @@ def run(argv=None, save_main_session=True):
|
|||||||
| "Drop empty PAON if missing SAON"
|
| "Drop empty PAON if missing SAON"
|
||||||
>> beam.ParDo(DropRecordsTwoEmptyColumn(3, 4))
|
>> beam.ParDo(DropRecordsTwoEmptyColumn(3, 4))
|
||||||
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B16 0AE"))
|
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B16 0AE"))
|
||||||
|
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B90 3LA"))
|
||||||
| "Split PAON into two columns if separated by comma"
|
| "Split PAON into two columns if separated by comma"
|
||||||
>> beam.ParDo(SplitColumn(3, ","))
|
>> beam.ParDo(SplitColumn(3, ","))
|
||||||
)
|
)
|
||||||
|
|
||||||
# Clean the data by creating an ID, and deduplicating to eliminate repeated rows.
|
# # Clean the data by creating an ID, and deduplicating to eliminate repeated rows.
|
||||||
clean_deduplicate = (
|
# clean_deduplicate = (
|
||||||
clean_drop
|
# clean_drop
|
||||||
| "Generate unique ID for all columns"
|
# | "Generate ID using all columns"
|
||||||
>> beam.ParDo(GenerateUniqueID(all_columns=True))
|
# >> beam.ParDo(GenerateUniqueID(all_columns=True))
|
||||||
| "Group by the ID for all columns"
|
# | "Group by the ID for all columns" >> beam.GroupByKey()
|
||||||
>> beam.GroupByKey()
|
# | "Deduplicate by the ID for all columns" >> beam.ParDo(DeduplicateByID())
|
||||||
| "Deduplicate by the ID for all columns" >> beam.ParDo(DeduplicateByID())
|
# )
|
||||||
)
|
|
||||||
|
|
||||||
# Prepare the data by generating an ID using the uniquely identifying
|
# Create a mapping table
|
||||||
# information only and grouping them by this ID.
|
mapping_table_raw = (
|
||||||
prepare = (
|
clean_drop
|
||||||
clean_deduplicate
|
| "Create a mapping table with key of id_all_columns and value of cleaned data."
|
||||||
| "Remove previous unique ID" >> beam.ParDo(RemoveUniqueID())
|
>> beam.ParDo(CreateMappingTable(all_columns=True))
|
||||||
| "Generate unique ID ignoring price & date"
|
|
||||||
>> beam.ParDo(GenerateUniqueID())
|
|
||||||
| "Group by the ID ignoring price & date"
|
|
||||||
>> beam.GroupByKey()
|
|
||||||
# | beam.Map(print)
|
# | beam.Map(print)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
mapping_table_condensed = (
|
||||||
|
mapping_table_raw
|
||||||
|
| "Condense mapping table into single dict" >> beam.combiners.ToDict()
|
||||||
|
# | beam.Map(print)
|
||||||
|
)
|
||||||
|
|
||||||
|
prepared = (
|
||||||
|
mapping_table_raw
|
||||||
|
| "Create unique ID ignoring price & date"
|
||||||
|
>> beam.ParDo(CreateUniquePropertyID())
|
||||||
|
| "Group IDs using all columns by IDs ignoring price & date"
|
||||||
|
>> beam.GroupByKey()
|
||||||
|
| "Deduplicate to eliminate repeated transactions"
|
||||||
|
>> beam.ParDo(DeduplicateIDs())
|
||||||
|
| "Insert the raw data using the mapping table"
|
||||||
|
>> beam.FlatMap(
|
||||||
|
insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed)
|
||||||
|
)
|
||||||
|
# | beam.Map(print)
|
||||||
|
)
|
||||||
|
|
||||||
|
# # Prepare the data by generating an ID using the uniquely identifying
|
||||||
|
# # information only and grouping them by this ID.
|
||||||
|
# prepare = (
|
||||||
|
# clean_deduplicate
|
||||||
|
# | "Remove previous unique ID" >> beam.ParDo(RemoveUniqueID())
|
||||||
|
# | "Generate unique ID ignoring price & date old"
|
||||||
|
# >> beam.ParDo(GenerateUniqueID())
|
||||||
|
# | "Group by the ID ignoring price & date" >> beam.GroupByKey()
|
||||||
|
# # | beam.Map(print)
|
||||||
|
# )
|
||||||
|
|
||||||
# Format the data into a dict.
|
# Format the data into a dict.
|
||||||
formatted = (
|
formatted = (
|
||||||
prepare
|
prepared
|
||||||
| "Convert the prepared data into a dict object"
|
| "Convert the prepared data into a dict object"
|
||||||
>> beam.ParDo(ConvertDataToDict())
|
>> beam.ParDo(ConvertDataToDict())
|
||||||
|
# | beam.Map(print)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the data to a .json file.
|
# Save the data to a .json file.
|
||||||
|
|||||||
Reference in New Issue
Block a user