Add latest Apache Beam pipeline code for Dataflow, with a GroupByKey optimisation (group on generated ids only, re-joining raw data via a mapping-table side input).

This commit is contained in:
2021-09-27 03:18:17 +01:00
parent 377e3c703f
commit 3a74579440

View File

@@ -9,8 +9,6 @@ import pathlib
import apache_beam as beam import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
# from analyse_properties.debug import DebugShowEmptyColumn, DebugShowColumnWithValueIn
def slice_by_range(element, *ranges): def slice_by_range(element, *ranges):
""" """
@@ -28,15 +26,11 @@ def slice_by_range(element, *ranges):
class DropRecordsSingleEmptyColumn(beam.DoFn): class DropRecordsSingleEmptyColumn(beam.DoFn):
def __init__(self, index):
self.index = index
def process(self, element):
""" """
Drop the entire row if a given column is empty. Drop the entire row if a given column is empty.
Args: Args:
element : The element index : The index of the column in the list.
Returns: Returns:
None: If the length of the column is 0, drop the element. None: If the length of the column is 0, drop the element.
@@ -44,6 +38,11 @@ class DropRecordsSingleEmptyColumn(beam.DoFn):
Yields: Yields:
element: If the length of the column isn't 0, keep the element. element: If the length of the column isn't 0, keep the element.
""" """
def __init__(self, index):
self.index = index
def process(self, element):
column = element[self.index] column = element[self.index]
if len(column) == 0: if len(column) == 0:
return None return None
@@ -51,7 +50,19 @@ class DropRecordsSingleEmptyColumn(beam.DoFn):
class DropRecordsTwoEmptyColumn(beam.DoFn): class DropRecordsTwoEmptyColumn(beam.DoFn):
"""If two given items in a list are both empty, drop this entry from the PCollection.""" """
Drop the entire row if both of two given columns are empty.
Args:
index_0 : The index of the first column in the list.
index_1 : The index of the second column in the list.
Returns:
None: If the length of both columns is 0, drop the element.
Yields:
element: If the length of both columns isn't 0, keep the element.
"""
def __init__(self, index_0, index_1): def __init__(self, index_0, index_1):
self.index_0 = index_0 self.index_0 = index_0
@@ -66,7 +77,14 @@ class DropRecordsTwoEmptyColumn(beam.DoFn):
class SplitColumn(beam.DoFn): class SplitColumn(beam.DoFn):
"""Split an item in a list into two separate items in the PCollection.""" """
Split one column into two columns by a character.
Args:
index : The index of the column in the list.
split_char: The character to split the column by.
"""
def __init__(self, index, split_char): def __init__(self, index, split_char):
self.index = index self.index = index
@@ -74,7 +92,7 @@ class SplitColumn(beam.DoFn):
def process(self, element): def process(self, element):
# If there is a split based on the split_char, then keep the first result in # If there is a split based on the split_char, then keep the first result in
# place and append the second. # place and append the second column at the end.
try: try:
part_0, part_1 = element[self.index].split(self.split_char) part_0, part_1 = element[self.index].split(self.split_char)
element[self.index] = part_1.strip() element[self.index] = part_1.strip()
@@ -87,8 +105,16 @@ class SplitColumn(beam.DoFn):
class CreateMappingTable(beam.DoFn): class CreateMappingTable(beam.DoFn):
""" """
Generate a unique ID for the PCollection, either for all the columns or for the Create a mapping table to be used as a side-input.
uniquely identifying data only.
This mapping table has a key of an id generated across all columns and a value of
the raw property data.
The table is used to populate the raw property data after a GroupByKey which is done on ids only
in order to reduce the amount of data processed in the GroupByKey operation.
Args:
all_columns: presumably controls whether the mapping id is generated across all columns rather than the uniquely identifying subset (default False) — confirm against CreateMappingTable.__init__.
""" """
def __init__(self, all_columns=False): def __init__(self, all_columns=False):
@@ -104,14 +130,6 @@ class CreateMappingTable(beam.DoFn):
yield new_element yield new_element
# class CreateMappingTable(beam.DoFn):
# def process(self, element):
# unique_string = ",".join(element)
# hashed_string = hashlib.md5(unique_string.encode())
# new_element = {hashed_string.hexdigest(): list(element)}
# yield new_element
class CreateUniquePropertyID(beam.DoFn): class CreateUniquePropertyID(beam.DoFn):
def process(self, element): def process(self, element):
unique_string = ",".join(element[-1][2:]) unique_string = ",".join(element[-1][2:])
@@ -127,48 +145,12 @@ class DeduplicateIDs(beam.DoFn):
yield new_element yield new_element
# class InsertDataForID(beam.DoFn):
# def __init__(self, mapping_table):
# self.mapping_table = mapping_table
# def process(self, element):
# replaced_list = [self.mapping_table[x] for x in element[-1]]
# new_element = (element[0], replaced_list)
# yield new_element
def insert_data_for_id(element, mapping_table): def insert_data_for_id(element, mapping_table):
replaced_list = [] replaced_list = [mapping_table[data_id] for data_id in element[-1]]
for data_id in element[-1]:
replaced_list.append(mapping_table[data_id])
# replaced_list = [mapping_table[x] for x in element[-1]]
new_element = (element[0], replaced_list) new_element = (element[0], replaced_list)
yield new_element yield new_element
# old
class DeduplicateByID(beam.DoFn):
"""
If the PCollection has multiple entries after being grouped by ID for all columns,
deduplicate the list to keep only one.
"""
def process(self, element):
if len(list(element[1])) > 0:
deduplicated_element = (element[0], [list(element[1])[0]])
yield deduplicated_element
else:
yield element
class RemoveUniqueID(beam.DoFn):
"""Remove the unique ID from the PCollection, transforming it back into a list."""
def process(self, element):
element_no_id = element[-1][0]
yield element_no_id
class ConvertDataToDict(beam.DoFn): class ConvertDataToDict(beam.DoFn):
"""Convert the processed data into a dict to be exported as a JSON object.""" """Convert the processed data into a dict to be exported as a JSON object."""
@@ -232,7 +214,7 @@ class ConvertDataToDict(beam.DoFn):
# Create the dict to hold all the information about the property. # Create the dict to hold all the information about the property.
json_object = { json_object = {
"property_id": element[0], "property_id": element[0],
# "readable_address": None, "readable_address": None,
"flat_appartment": list(element[-1])[0][4], "flat_appartment": list(element[-1])[0][4],
"builing": list(element[-1])[0][10], "builing": list(element[-1])[0][10],
"number": list(element[-1])[0][3], "number": list(element[-1])[0][3],
@@ -252,20 +234,20 @@ class ConvertDataToDict(beam.DoFn):
} }
# Create a human readable address to go in the dict. # Create a human readable address to go in the dict.
# json_object["readable_address"] = self.get_readable_address( json_object["readable_address"] = self.get_readable_address(
# [ [
# json_object["flat_appartment"], json_object["flat_appartment"],
# json_object["builing"], json_object["builing"],
# f'{json_object["number"]} {json_object["street"]}', f'{json_object["number"]} {json_object["street"]}',
# json_object["postcode"], json_object["postcode"],
# ], ],
# [ [
# json_object["locality"], json_object["locality"],
# json_object["town"], json_object["town"],
# json_object["district"], json_object["district"],
# json_object["county"], json_object["county"],
# ], ],
# ) )
yield json_object yield json_object
@@ -276,13 +258,15 @@ def run(argv=None, save_main_session=True):
pathlib.Path(__file__).parents[1] pathlib.Path(__file__).parents[1]
/ "data" / "data"
/ "input" / "input"
/ "pp-monthly-update-new-version.csv" / "pp-2020.csv"
# / "pp-complete.csv"
) )
output_file = ( output_file = (
pathlib.Path(__file__).parents[1] pathlib.Path(__file__).parents[1]
/ "data" / "data"
/ "output" / "output"
/ "pp-monthly-update-new-version" / "pp-2020"
# / "pp-complete"
) )
# Arguments # Arguments
@@ -322,33 +306,20 @@ def run(argv=None, save_main_session=True):
| "Drop Empty Postcodes" >> beam.ParDo(DropRecordsSingleEmptyColumn(2)) | "Drop Empty Postcodes" >> beam.ParDo(DropRecordsSingleEmptyColumn(2))
| "Drop empty PAON if missing SAON" | "Drop empty PAON if missing SAON"
>> beam.ParDo(DropRecordsTwoEmptyColumn(3, 4)) >> beam.ParDo(DropRecordsTwoEmptyColumn(3, 4))
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B16 0AE"))
# | beam.ParDo(DebugShowColumnWithValueIn(2, "B90 3LA"))
| "Split PAON into two columns if separated by comma" | "Split PAON into two columns if separated by comma"
>> beam.ParDo(SplitColumn(3, ",")) >> beam.ParDo(SplitColumn(3, ","))
) )
# # Clean the data by creating an ID, and deduplicating to eliminate repeated rows.
# clean_deduplicate = (
# clean_drop
# | "Generate ID using all columns"
# >> beam.ParDo(GenerateUniqueID(all_columns=True))
# | "Group by the ID for all columns" >> beam.GroupByKey()
# | "Deduplicate by the ID for all columns" >> beam.ParDo(DeduplicateByID())
# )
# Create a mapping table # Create a mapping table
mapping_table_raw = ( mapping_table_raw = (
clean_drop clean_drop
| "Create a mapping table with key of id_all_columns and value of cleaned data." | "Create a mapping table with key of id_all_columns and value of cleaned data."
>> beam.ParDo(CreateMappingTable(all_columns=True)) >> beam.ParDo(CreateMappingTable(all_columns=True))
# | beam.Map(print)
) )
mapping_table_condensed = ( mapping_table_condensed = (
mapping_table_raw mapping_table_raw
| "Condense mapping table into single dict" >> beam.combiners.ToDict() | "Condense mapping table into single dict" >> beam.combiners.ToDict()
# | beam.Map(print)
) )
prepared = ( prepared = (
@@ -363,26 +334,13 @@ def run(argv=None, save_main_session=True):
>> beam.FlatMap( >> beam.FlatMap(
insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed) insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed)
) )
# | beam.Map(print)
) )
# # Prepare the data by generating an ID using the uniquely identifying
# # information only and grouping them by this ID.
# prepare = (
# clean_deduplicate
# | "Remove previous unique ID" >> beam.ParDo(RemoveUniqueID())
# | "Generate unique ID ignoring price & date old"
# >> beam.ParDo(GenerateUniqueID())
# | "Group by the ID ignoring price & date" >> beam.GroupByKey()
# # | beam.Map(print)
# )
# Format the data into a dict. # Format the data into a dict.
formatted = ( formatted = (
prepared prepared
| "Convert the prepared data into a dict object" | "Convert the prepared data into a dict object"
>> beam.ParDo(ConvertDataToDict()) >> beam.ParDo(ConvertDataToDict())
# | beam.Map(print)
) )
# Save the data to a .json file. # Save the data to a .json file.