mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 03:55:43 +00:00
adding latest beam pipeline code for dataflow with group optimisation
This commit is contained in:
@@ -9,8 +9,6 @@ import pathlib
|
|||||||
import apache_beam as beam
|
import apache_beam as beam
|
||||||
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
|
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
|
||||||
|
|
||||||
# from analyse_properties.debug import DebugShowEmptyColumn, DebugShowColumnWithValueIn
|
|
||||||
|
|
||||||
|
|
||||||
def slice_by_range(element, *ranges):
|
def slice_by_range(element, *ranges):
|
||||||
"""
|
"""
|
||||||
@@ -28,15 +26,11 @@ def slice_by_range(element, *ranges):
|
|||||||
|
|
||||||
|
|
||||||
class DropRecordsSingleEmptyColumn(beam.DoFn):
|
class DropRecordsSingleEmptyColumn(beam.DoFn):
|
||||||
def __init__(self, index):
|
|
||||||
self.index = index
|
|
||||||
|
|
||||||
def process(self, element):
|
|
||||||
"""
|
"""
|
||||||
Drop the entire row if a given column is empty.
|
Drop the entire row if a given column is empty.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
element : The element
|
index : The index of the column in the list.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
None: If the length of the column is 0, drop the element.
|
None: If the length of the column is 0, drop the element.
|
||||||
@@ -44,6 +38,11 @@ class DropRecordsSingleEmptyColumn(beam.DoFn):
|
|||||||
Yields:
|
Yields:
|
||||||
element: If the length of the column isn't 0, keep the element.
|
element: If the length of the column isn't 0, keep the element.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, index):
|
||||||
|
self.index = index
|
||||||
|
|
||||||
|
def process(self, element):
|
||||||
column = element[self.index]
|
column = element[self.index]
|
||||||
if len(column) == 0:
|
if len(column) == 0:
|
||||||
return None
|
return None
|
||||||
@@ -51,7 +50,19 @@ class DropRecordsSingleEmptyColumn(beam.DoFn):
|
|||||||
|
|
||||||
|
|
||||||
class DropRecordsTwoEmptyColumn(beam.DoFn):
|
class DropRecordsTwoEmptyColumn(beam.DoFn):
|
||||||
"""If two given items in a list are both empty, drop this entry from the PCollection."""
|
"""
|
||||||
|
Drop the entire row if both of two given columns are empty.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
index_0 : The index of the first column in the list.
|
||||||
|
index_1 : The index of the second column in the list.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None: If the length of both columns is 0, drop the element.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
element: If the length of both columns isn't 0, keep the element.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, index_0, index_1):
|
def __init__(self, index_0, index_1):
|
||||||
self.index_0 = index_0
|
self.index_0 = index_0
|
||||||
@@ -66,7 +77,14 @@ class DropRecordsTwoEmptyColumn(beam.DoFn):
|
|||||||
|
|
||||||
|
|
||||||
class SplitColumn(beam.DoFn):
|
class SplitColumn(beam.DoFn):
|
||||||
"""Split an item in a list into two separate items in the PCollection."""
|
"""
|
||||||
|
Split one column into two columns by a character.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
index : The index of the column in the list.
|
||||||
|
split_char: The character to split the column by.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, index, split_char):
|
def __init__(self, index, split_char):
|
||||||
self.index = index
|
self.index = index
|
||||||
@@ -74,7 +92,7 @@ class SplitColumn(beam.DoFn):
|
|||||||
|
|
||||||
def process(self, element):
|
def process(self, element):
|
||||||
# If there is a split based on the split_char, then keep the first result in
|
# If there is a split based on the split_char, then keep the first result in
|
||||||
# place and append the second.
|
# place and append the second column at the end.
|
||||||
try:
|
try:
|
||||||
part_0, part_1 = element[self.index].split(self.split_char)
|
part_0, part_1 = element[self.index].split(self.split_char)
|
||||||
element[self.index] = part_1.strip()
|
element[self.index] = part_1.strip()
|
||||||
@@ -87,8 +105,16 @@ class SplitColumn(beam.DoFn):
|
|||||||
|
|
||||||
class CreateMappingTable(beam.DoFn):
|
class CreateMappingTable(beam.DoFn):
|
||||||
"""
|
"""
|
||||||
Generate a unique ID for the PCollection, either for all the columns or for the
|
Create a mapping table to be used as a side-input.
|
||||||
uniquely identifying data only.
|
|
||||||
|
This mapping table has a key of an id generated across all columns and a value of
|
||||||
|
the raw property data.
|
||||||
|
|
||||||
|
The table is used to populate the raw property data after a GroupByKey which is done on ids only
|
||||||
|
in order to reduce the amount of data processed in the GroupByKey operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
all_columns
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, all_columns=False):
|
def __init__(self, all_columns=False):
|
||||||
@@ -104,14 +130,6 @@ class CreateMappingTable(beam.DoFn):
|
|||||||
yield new_element
|
yield new_element
|
||||||
|
|
||||||
|
|
||||||
# class CreateMappingTable(beam.DoFn):
|
|
||||||
# def process(self, element):
|
|
||||||
# unique_string = ",".join(element)
|
|
||||||
# hashed_string = hashlib.md5(unique_string.encode())
|
|
||||||
# new_element = {hashed_string.hexdigest(): list(element)}
|
|
||||||
# yield new_element
|
|
||||||
|
|
||||||
|
|
||||||
class CreateUniquePropertyID(beam.DoFn):
|
class CreateUniquePropertyID(beam.DoFn):
|
||||||
def process(self, element):
|
def process(self, element):
|
||||||
unique_string = ",".join(element[-1][2:])
|
unique_string = ",".join(element[-1][2:])
|
||||||
@@ -127,48 +145,12 @@ class DeduplicateIDs(beam.DoFn):
|
|||||||
yield new_element
|
yield new_element
|
||||||
|
|
||||||
|
|
||||||
# class InsertDataForID(beam.DoFn):
|
|
||||||
# def __init__(self, mapping_table):
|
|
||||||
# self.mapping_table = mapping_table
|
|
||||||
|
|
||||||
# def process(self, element):
|
|
||||||
# replaced_list = [self.mapping_table[x] for x in element[-1]]
|
|
||||||
# new_element = (element[0], replaced_list)
|
|
||||||
# yield new_element
|
|
||||||
|
|
||||||
|
|
||||||
def insert_data_for_id(element, mapping_table):
|
def insert_data_for_id(element, mapping_table):
|
||||||
replaced_list = []
|
replaced_list = [mapping_table[data_id] for data_id in element[-1]]
|
||||||
for data_id in element[-1]:
|
|
||||||
replaced_list.append(mapping_table[data_id])
|
|
||||||
# replaced_list = [mapping_table[x] for x in element[-1]]
|
|
||||||
new_element = (element[0], replaced_list)
|
new_element = (element[0], replaced_list)
|
||||||
yield new_element
|
yield new_element
|
||||||
|
|
||||||
|
|
||||||
# old
|
|
||||||
class DeduplicateByID(beam.DoFn):
|
|
||||||
"""
|
|
||||||
If the PCollection has multiple entries after being grouped by ID for all columns,
|
|
||||||
deduplicate the list to keep only one.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def process(self, element):
|
|
||||||
if len(list(element[1])) > 0:
|
|
||||||
deduplicated_element = (element[0], [list(element[1])[0]])
|
|
||||||
yield deduplicated_element
|
|
||||||
else:
|
|
||||||
yield element
|
|
||||||
|
|
||||||
|
|
||||||
class RemoveUniqueID(beam.DoFn):
|
|
||||||
"""Remove the unique ID from the PCollection, transforming it back into a list."""
|
|
||||||
|
|
||||||
def process(self, element):
|
|
||||||
element_no_id = element[-1][0]
|
|
||||||
yield element_no_id
|
|
||||||
|
|
||||||
|
|
||||||
class ConvertDataToDict(beam.DoFn):
|
class ConvertDataToDict(beam.DoFn):
|
||||||
"""Convert the processed data into a dict to be exported as a JSON object."""
|
"""Convert the processed data into a dict to be exported as a JSON object."""
|
||||||
|
|
||||||
@@ -232,7 +214,7 @@ class ConvertDataToDict(beam.DoFn):
|
|||||||
# Create the dict to hold all the information about the property.
|
# Create the dict to hold all the information about the property.
|
||||||
json_object = {
|
json_object = {
|
||||||
"property_id": element[0],
|
"property_id": element[0],
|
||||||
# "readable_address": None,
|
"readable_address": None,
|
||||||
"flat_appartment": list(element[-1])[0][4],
|
"flat_appartment": list(element[-1])[0][4],
|
||||||
"builing": list(element[-1])[0][10],
|
"builing": list(element[-1])[0][10],
|
||||||
"number": list(element[-1])[0][3],
|
"number": list(element[-1])[0][3],
|
||||||
@@ -252,20 +234,20 @@ class ConvertDataToDict(beam.DoFn):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Create a human readable address to go in the dict.
|
# Create a human readable address to go in the dict.
|
||||||
# json_object["readable_address"] = self.get_readable_address(
|
json_object["readable_address"] = self.get_readable_address(
|
||||||
# [
|
[
|
||||||
# json_object["flat_appartment"],
|
json_object["flat_appartment"],
|
||||||
# json_object["builing"],
|
json_object["builing"],
|
||||||
# f'{json_object["number"]} {json_object["street"]}',
|
f'{json_object["number"]} {json_object["street"]}',
|
||||||
# json_object["postcode"],
|
json_object["postcode"],
|
||||||
# ],
|
],
|
||||||
# [
|
[
|
||||||
# json_object["locality"],
|
json_object["locality"],
|
||||||
# json_object["town"],
|
json_object["town"],
|
||||||
# json_object["district"],
|
json_object["district"],
|
||||||
# json_object["county"],
|
json_object["county"],
|
||||||
# ],
|
],
|
||||||
# )
|
)
|
||||||
yield json_object
|
yield json_object
|
||||||
|
|
||||||
|
|
||||||
@@ -276,13 +258,15 @@ def run(argv=None, save_main_session=True):
|
|||||||
pathlib.Path(__file__).parents[1]
|
pathlib.Path(__file__).parents[1]
|
||||||
/ "data"
|
/ "data"
|
||||||
/ "input"
|
/ "input"
|
||||||
/ "pp-monthly-update-new-version.csv"
|
/ "pp-2020.csv"
|
||||||
|
# / "pp-complete.csv"
|
||||||
)
|
)
|
||||||
output_file = (
|
output_file = (
|
||||||
pathlib.Path(__file__).parents[1]
|
pathlib.Path(__file__).parents[1]
|
||||||
/ "data"
|
/ "data"
|
||||||
/ "output"
|
/ "output"
|
||||||
/ "pp-monthly-update-new-version"
|
/ "pp-2020"
|
||||||
|
# / "pp-complete"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Arguments
|
# Arguments
|
||||||
@@ -322,33 +306,20 @@ def run(argv=None, save_main_session=True):
|
|||||||
| "Drop Empty Postcodes" >> beam.ParDo(DropRecordsSingleEmptyColumn(2))
|
| "Drop Empty Postcodes" >> beam.ParDo(DropRecordsSingleEmptyColumn(2))
|
||||||
| "Drop empty PAON if missing SAON"
|
| "Drop empty PAON if missing SAON"
|
||||||
>> beam.ParDo(DropRecordsTwoEmptyColumn(3, 4))
|
>> beam.ParDo(DropRecordsTwoEmptyColumn(3, 4))
|
||||||
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B16 0AE"))
|
|
||||||
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B90 3LA"))
|
|
||||||
| "Split PAON into two columns if separated by comma"
|
| "Split PAON into two columns if separated by comma"
|
||||||
>> beam.ParDo(SplitColumn(3, ","))
|
>> beam.ParDo(SplitColumn(3, ","))
|
||||||
)
|
)
|
||||||
|
|
||||||
# # Clean the data by creating an ID, and deduplicating to eliminate repeated rows.
|
|
||||||
# clean_deduplicate = (
|
|
||||||
# clean_drop
|
|
||||||
# | "Generate ID using all columns"
|
|
||||||
# >> beam.ParDo(GenerateUniqueID(all_columns=True))
|
|
||||||
# | "Group by the ID for all columns" >> beam.GroupByKey()
|
|
||||||
# | "Deduplicate by the ID for all columns" >> beam.ParDo(DeduplicateByID())
|
|
||||||
# )
|
|
||||||
|
|
||||||
# Create a mapping table
|
# Create a mapping table
|
||||||
mapping_table_raw = (
|
mapping_table_raw = (
|
||||||
clean_drop
|
clean_drop
|
||||||
| "Create a mapping table with key of id_all_columns and value of cleaned data."
|
| "Create a mapping table with key of id_all_columns and value of cleaned data."
|
||||||
>> beam.ParDo(CreateMappingTable(all_columns=True))
|
>> beam.ParDo(CreateMappingTable(all_columns=True))
|
||||||
# | beam.Map(print)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
mapping_table_condensed = (
|
mapping_table_condensed = (
|
||||||
mapping_table_raw
|
mapping_table_raw
|
||||||
| "Condense mapping table into single dict" >> beam.combiners.ToDict()
|
| "Condense mapping table into single dict" >> beam.combiners.ToDict()
|
||||||
# | beam.Map(print)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
prepared = (
|
prepared = (
|
||||||
@@ -363,26 +334,13 @@ def run(argv=None, save_main_session=True):
|
|||||||
>> beam.FlatMap(
|
>> beam.FlatMap(
|
||||||
insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed)
|
insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed)
|
||||||
)
|
)
|
||||||
# | beam.Map(print)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# # Prepare the data by generating an ID using the uniquely identifying
|
|
||||||
# # information only and grouping them by this ID.
|
|
||||||
# prepare = (
|
|
||||||
# clean_deduplicate
|
|
||||||
# | "Remove previous unique ID" >> beam.ParDo(RemoveUniqueID())
|
|
||||||
# | "Generate unique ID ignoring price & date old"
|
|
||||||
# >> beam.ParDo(GenerateUniqueID())
|
|
||||||
# | "Group by the ID ignoring price & date" >> beam.GroupByKey()
|
|
||||||
# # | beam.Map(print)
|
|
||||||
# )
|
|
||||||
|
|
||||||
# Format the data into a dict.
|
# Format the data into a dict.
|
||||||
formatted = (
|
formatted = (
|
||||||
prepared
|
prepared
|
||||||
| "Convert the prepared data into a dict object"
|
| "Convert the prepared data into a dict object"
|
||||||
>> beam.ParDo(ConvertDataToDict())
|
>> beam.ParDo(ConvertDataToDict())
|
||||||
# | beam.Map(print)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the data to a .json file.
|
# Save the data to a .json file.
|
||||||
|
|||||||
Reference in New Issue
Block a user