From 7f9b7e4bfd2e40633da4534aa67877cd559b9499 Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 Sep 2021 06:03:55 +0100 Subject: [PATCH 1/5] moving inputs/outputs to use pathlib --- analyse_properties/data/__init__.py | 0 analyse_properties/data/input/__init__.py | 0 analyse_properties/data/output/__init__.py | 0 analyse_properties/main.py | 37 +++++++++------------- download_data.sh | 4 +-- 5 files changed, 17 insertions(+), 24 deletions(-) delete mode 100644 analyse_properties/data/__init__.py delete mode 100644 analyse_properties/data/input/__init__.py delete mode 100644 analyse_properties/data/output/__init__.py diff --git a/analyse_properties/data/__init__.py b/analyse_properties/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/analyse_properties/data/input/__init__.py b/analyse_properties/data/input/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/analyse_properties/data/output/__init__.py b/analyse_properties/data/output/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/analyse_properties/main.py b/analyse_properties/main.py index 95b8299..cf43f89 100644 --- a/analyse_properties/main.py +++ b/analyse_properties/main.py @@ -1,22 +1,13 @@ -import csv from datetime import datetime import hashlib -import io -from importlib import resources import itertools import pathlib import apache_beam as beam -from apache_beam.io import fileio # from analyse_properties.debug import DebugShowEmptyColumn, DebugShowColumnWithValueIn -def csv_reader(csv_file): - """Read in a csv file.""" - return csv.reader(io.TextIOWrapper(csv_file.open())) - - def slice_by_range(element, *ranges): """Slice a list with multiple ranges.""" return itertools.chain(*(itertools.islice(element, *r) for r in ranges)) @@ -214,22 +205,22 @@ class ConvertDataToDict(beam.DoFn): def main(): # Load in the data from a csv file. 
- csv_data = resources.path( - # "analyse_properties.data.input", - # "pp-monthly-update-new-version.csv" - "analyse_properties.data.input", "pp-complete.csv" + input_file = ( + pathlib.Path(__file__).parents[1] + / "data" + / "input" + / "pp-monthly-update-new-version.csv" ) with beam.Pipeline() as pipeline: # Load the data - with csv_data as csv_data_file: - # https://github.com/apache/beam/blob/v2.32.0/sdks/python/apache_beam/io/fileio_test.py#L155-L170 - load = ( - pipeline - | fileio.MatchFiles(str(csv_data_file)) - | fileio.ReadMatches() - | beam.FlatMap(csv_reader) - ) + load = ( + pipeline + | "Read input data" >> beam.io.ReadFromText(str(input_file)) + | "Split by ','" >> beam.Map(lambda element: element.split(",")) + | "Remove leading and trailing quotes" + >> beam.Map(lambda element: [el.strip('"') for el in element]) + ) # Clean the data by dropping unneeded rows. clean_drop = ( @@ -276,7 +267,9 @@ def main(): ) # Save the data to a .json file. - output_file = pathlib.Path(__file__).parent / "data" / "output" / "pp-complete" + output_file = ( + pathlib.Path(__file__).parents[1] / "data" / "output" / "pp-complete" + ) output = ( formatted | "Combine into one PCollection" >> beam.combiners.ToList() diff --git a/download_data.sh b/download_data.sh index d69c6e4..dc0fa7a 100755 --- a/download_data.sh +++ b/download_data.sh @@ -1,5 +1,5 @@ # Full data set -wget https://storage.googleapis.com/street-group-technical-test-dmot/pp-complete.csv -P analyse_properties/data/input +# wget https://storage.googleapis.com/street-group-technical-test-dmot/pp-complete.csv -P data/input # Monthly update data set -# wget https://storage.googleapis.com/street-group-technical-test-dmot/pp-monthly-update-new-version.csv -P analyse_properties/data/input +wget https://storage.googleapis.com/street-group-technical-test-dmot/pp-monthly-update-new-version.csv -P data/input From 62bd0196adf3a7b5cf29926fea8273ab70d050ac Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 
Sep 2021 06:11:42 +0100 Subject: [PATCH 2/5] removing shard_name_template from output file --- analyse_properties/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/analyse_properties/main.py b/analyse_properties/main.py index cf43f89..0053233 100644 --- a/analyse_properties/main.py +++ b/analyse_properties/main.py @@ -277,7 +277,6 @@ def main(): >> beam.io.WriteToText( file_path_prefix=str(output_file), file_name_suffix=".json", - shard_name_template="", ) ) From c4e81065b1ea57e126ef6d40041246cca2f84148 Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 Sep 2021 14:55:05 +0100 Subject: [PATCH 3/5] adding pyenv 3.7.9 --- .python-version | 1 + 1 file changed, 1 insertion(+) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c77a7de --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.7.9 From b8a997084d767c09dc050213086f02326140754c Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 Sep 2021 14:55:48 +0100 Subject: [PATCH 4/5] removing requirements.txt for documentation --- requirements-docs.txt | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 requirements-docs.txt diff --git a/requirements-docs.txt b/requirements-docs.txt deleted file mode 100644 index 5f99f40..0000000 --- a/requirements-docs.txt +++ /dev/null @@ -1,6 +0,0 @@ -apache-beam==2.32.0; python_version >= "3.6" -avro-python3==1.9.2.1; python_version >= "3.6" -cachetools==4.2.2; python_version >= "3.6" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0") -certifi==2021.5.30; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6" -mkdocs-material==7.3.0 -mkdocs==1.2.2; python_version >= "3.6" From 7f874fa6f6db8b25caadda738685fa78508817f5 Mon Sep 17 00:00:00 2001 From: Daniel Tomlinson Date: Sun, 26 Sep 2021 14:56:01 +0100 Subject: [PATCH 
5/5] updating README.md --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f0f4ae..c308a39 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,9 @@ # street_group_tech_test -Technical Test for Street Group + +Technical Test for Street Group for Daniel Tomlinson. + +## Documentation + +Read the documentation on GitHub Pages for instructions on running the code and a discussion of the approach. + +https://dtomlinson91.github.io/street_group_tech_test/