Mirror of https://github.com/dtomlinson91/street_group_tech_test (synced 2025-12-22 20:05:45 +00:00)

Compare commits: wip/spark_ ... develop (41 commits)
| SHA1 |
|---|
| e172b704a7 |
| 8a0d8085a2 |
| c481c1a976 |
| 577aa9e388 |
| 4d3e5fbc23 |
| a53d79118a |
| 4561f1a356 |
| 4056ca1f32 |
| cfdee9d3ed |
| cbb8a7e237 |
| a73d7b74a4 |
| 76434fae5b |
| 886a37ca94 |
| 3263b3dd8b |
| dffc6aa553 |
| f9eeb8bfad |
| cad6612ebe |
| 391861d80c |
| f60beb4565 |
| f2ed60426d |
| 7db1edb90c |
| 3a74579440 |
| 377e3c703f |
| a8fc06c764 |
| eaa36877f6 |
| 1941fcb7bf |
| 99e67c2840 |
| 8e8469579e |
| 4e3771c728 |
| 8856a9763f |
| fded858932 |
| bb71d55f8c |
| 8047b5ced4 |
| 9f53c66975 |
| e6ec110d54 |
| 83807616e0 |
| 7f874fa6f6 |
| b8a997084d |
| c4e81065b1 |
| 62bd0196ad |
| 7f9b7e4bfd |

.python-version (new file, 1 line)

@@ -0,0 +1 @@
+3.7.9
@@ -1,2 +1,9 @@
 # street_group_tech_test
-Technical Test for Street Group
+Technical Test for Street Group for Daniel Tomlinson.
+
+## Documentation
+
+Read the documentation on github pages for instructions around running the code and a discussion on the approach.
+
+https://dtomlinson91.github.io/street_group_tech_test/
@@ -22,3 +22,9 @@ class DebugShowColumnWithValueIn(beam.DoFn):
         if self.value in column:
             yield element
         return None
+
+
+class DebugPrint(beam.DoFn):
+    def process(self, element):
+        print(element)
+        yield element
@@ -1,29 +1,44 @@
-import csv
+import argparse
 from datetime import datetime
 import hashlib
-import io
-from importlib import resources
 import itertools
+import json
+import logging
 import pathlib

 import apache_beam as beam
-from apache_beam.io import fileio
+from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

-# from analyse_properties.debug import DebugShowEmptyColumn, DebugShowColumnWithValueIn
-
-
-def csv_reader(csv_file):
-    """Read in a csv file."""
-    return csv.reader(io.TextIOWrapper(csv_file.open()))
+# from analyse_properties.debug import * # noqa


 def slice_by_range(element, *ranges):
-    """Slice a list with multiple ranges."""
+    """
+    Slice a list with multiple ranges.
+
+    Args:
+        element : The element.
+        *ranges (tuple): Tuples containing a start,end index to slice the element.
+            E.g (0, 3), (5, 6) - Keeps columns 0,1,2,5. Drops everything else.
+
+    Returns:
+        list: The list sliced by the ranges
+    """
     return itertools.chain(*(itertools.islice(element, *r) for r in ranges))


 class DropRecordsSingleEmptyColumn(beam.DoFn):
-    """If a given item in a list is empty, drop this entry from the PCollection."""
+    """
+    Drop the entire row if a given column is empty.
+
+    Args:
+        index : The index of the column in the list.
+
+    Returns:
+        None: If the length of the column is 0, drop the element.
+
+    Yields:
+        element: If the length of the column is >0, keep the element.
+    """
+
     def __init__(self, index):
         self.index = index
@@ -36,7 +51,19 @@ class DropRecordsSingleEmptyColumn(beam.DoFn):


 class DropRecordsTwoEmptyColumn(beam.DoFn):
-    """If two given items in a list are both empty, drop this entry from the PCollection."""
+    """
+    Drop the entire row if both of two given columns are empty.
+
+    Args:
+        index_0 : The index of the first column in the list.
+        index_1 : The index of the second column in the list.
+
+    Returns:
+        None: If the length of both columns is 0, drop the element.
+
+    Yields:
+        element: If the length of both columns is >0, keep the element.
+    """
+
     def __init__(self, index_0, index_1):
         self.index_0 = index_0
@@ -51,65 +78,91 @@ class DropRecordsTwoEmptyColumn(beam.DoFn):


 class SplitColumn(beam.DoFn):
-    """Split an item in a list into two separate items in the PCollection."""
+    """
+    Split one column into two columns by a character.
+
+    Args:
+        index : The index of the column in the list.
+        split_char: The character to split the column by.
+    """
+
     def __init__(self, index, split_char):
         self.index = index
         self.split_char = split_char

     def process(self, element):
-        # If there is a split based on the split_char, then keep the first result in
-        # place and append the second.
+        # If there is a split based on the split_char, then keep the second result in
+        # place (street number) and append the first result (building) at the end.
         try:
             part_0, part_1 = element[self.index].split(self.split_char)
             element[self.index] = part_1.strip()
             element.append(part_0.strip())
             yield element
         except ValueError:
+            # append a blank column to keep column numbers consistent.
             element.append("")
             yield element


-class GenerateUniqueID(beam.DoFn):
-    """
-    Generate a unique ID for the PCollection, either for all the columns or for the
-    uniquely identifying data only.
-    """
-
-    def __init__(self, all_columns=False):
-        self.all_columns = all_columns
+class CreateMappingTable(beam.DoFn):
+    """
+    Create a mapping table to be used as a side-input.
+
+    This mapping table has a key of an ID generated across all columns and a value of
+    the raw property data.
+
+    The table is used to populate the raw property data after a GroupByKey using
+    only the IDs in order to reduce the amount of data processed in the GroupByKey operation.
+    """

     def process(self, element):
-        unique_string = (
-            ",".join(element[2:]) if not self.all_columns else ",".join(element)
-        )
+        # Join the row into a string.
+        unique_string = ",".join(element)
+        # Hash the string.
         hashed_string = hashlib.md5(unique_string.encode())
-        # append the hash to the end
-        element.append(hashed_string.hexdigest())
-        yield element
+        # Format the resulting PCollection with the key of id and value of raw data.
+        new_element = (hashed_string.hexdigest(), list(element))
+        yield new_element


-class DeduplicateByID(beam.DoFn):
+class CreateUniquePropertyID(beam.DoFn):
     """
-    If the PCollection has multiple entries after being grouped by ID for all columns,
-    deduplicate the list to keep only one.
+    Create a unique property ID which does not include the price and date of sale.
+
+    Uses each row of the mapping table to create a PCollection with a key of the
+    unique property ID and a value of the ID generated across all columns.
     """

     def process(self, element):
-        if len(element[1]) > 0:
-            deduplicated_element = (element[0], [element[1][0]])
-            yield deduplicated_element
-        else:
-            yield element
+        unique_string = ",".join(element[-1][2:])
+        hashed_string = hashlib.md5(unique_string.encode())
+        new_element = (hashed_string.hexdigest(), element[0])
+        yield new_element


-class RemoveUniqueID(beam.DoFn):
-    """Remove the unique ID from the PCollection, transforming it back into a list."""
+class DeduplicateIDs(beam.DoFn):
+    """Deduplicate a list of IDs."""

     def process(self, element):
-        element_no_id = element[-1][0]
-        element_no_id.pop(-1)
-        yield element_no_id
+        deduplicated_list = list(set(element[-1]))
+        new_element = (element[0], deduplicated_list)
+        yield new_element
+
+
+def insert_data_for_id(element, mapping_table):
+    """
+    Replace the ID with the raw data from the mapping table.
+
+    Args:
+        element: The element.
+        mapping_table (dict): The mapping table.
+
+    Yields:
+        The element with IDs replaced with raw data.
+    """
+    replaced_list = [mapping_table[data_id] for data_id in element[-1]]
+    new_element = (element[0], replaced_list)
+    yield new_element


 class ConvertDataToDict(beam.DoFn):
@@ -117,7 +170,15 @@ class ConvertDataToDict(beam.DoFn):

     @staticmethod
     def get_latest_transaction(transaction_dates):
-        """Get the date of the latest transaction."""
+        """
+        Get the date of the latest transaction for a list of dates.
+
+        Args:
+            transaction_dates (str): A date in the form "%Y-%m-%d".
+
+        Returns:
+            str: The year in the form "%Y" of the latest transaction date.
+        """
         transaction_dates = [
             datetime.strptime(individual_transaction, "%Y-%m-%d")
             for individual_transaction in transaction_dates
@@ -125,7 +186,17 @@ class ConvertDataToDict(beam.DoFn):
         return max(transaction_dates).strftime("%Y")

     @staticmethod
-    def get_readable_address(address_components: list, address_comparisons: list):
+    def get_readable_address(address_components, address_comparisons):
+        """
+        Create a human readable address from the locality/town/district/county columns.
+
+        Args:
+            address_components (list): The preceeding parts of the address (street, postcode etc.)
+            address_comparisons (list): The locality/town/district/county.
+
+        Returns:
+            str: The complete address deduplicated & cleaned.
+        """
         # Get pairwise comparison to see if two locality/town/district/counties
         # are equivalent
         pairwise_comparison = [
@@ -146,7 +217,6 @@ class ConvertDataToDict(beam.DoFn):
         applied_mask = list(itertools.compress(address_comparisons, mask))
         # Filter out empty items in list
         deduplicated_address_part = list(filter(None, applied_mask))
-
         # Filter out any missing parts of the address components
         cleaned_address_components = list(filter(None, address_components))

@@ -165,9 +235,9 @@ class ConvertDataToDict(beam.DoFn):
         # Group together all the transactions for the property.
         property_transactions = [
             {
-                "price": entry[0],
+                "price": int(entry[0]),
                 "transaction_date": entry[1].replace(" 00:00", ""),
-                "year": entry[1][0:4],
+                "year": int(entry[1][0:4]),
             }
             for entry in element[-1]
         ]
@@ -176,22 +246,22 @@ class ConvertDataToDict(beam.DoFn):
         json_object = {
             "property_id": element[0],
             "readable_address": None,
-            "flat_appartment": element[-1][0][4],
-            "builing": element[-1][0][10],
-            "number": element[-1][0][3],
-            "street": element[-1][0][5],
-            "locality": element[-1][0][6],
-            "town": element[-1][0][7],
-            "district": element[-1][0][8],
-            "county": element[-1][0][9],
-            "postcode": element[-1][0][2],
+            "flat_appartment": list(element[-1])[0][4],
+            "builing": list(element[-1])[0][10],
+            "number": list(element[-1])[0][3],
+            "street": list(element[-1])[0][5],
+            "locality": list(element[-1])[0][6],
+            "town": list(element[-1])[0][7],
+            "district": list(element[-1])[0][8],
+            "county": list(element[-1])[0][9],
+            "postcode": list(element[-1])[0][2],
             "property_transactions": property_transactions,
-            "latest_transaction_year": self.get_latest_transaction(
+            "latest_transaction_year": int(self.get_latest_transaction(
                 [
                     transaction["transaction_date"]
                     for transaction in property_transactions
                 ]
-            ),
+            )),
         }

         # Create a human readable address to go in the dict.
@@ -212,26 +282,49 @@ class ConvertDataToDict(beam.DoFn):
         yield json_object


-def main():
-    # Load in the data from a csv file.
-    csv_data = resources.path(
-        # "analyse_properties.data.input",
-        # "pp-monthly-update-new-version.csv"
-        "analyse_properties.data.input", "pp-complete.csv"
-    )
+def run(argv=None, save_main_session=True):
+    """Entrypoint and definition of the pipeline."""
+    logging.getLogger().setLevel(logging.INFO)
+
+    # Default input/output files when ran from base of repo with files in ./data
+    input_file = (
+        pathlib.Path("./data/input/pp-2020.csv")
+    )
+    output_file = (
+        pathlib.Path("./data/output/pp-2020")
+    )
+
+    # Arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input",
+        dest="input",
+        default=str(input_file),
+        help="Full path to the input file.",
+    )
+    parser.add_argument(
+        "--output",
+        dest="output",
+        default=str(output_file),
+        help="Full path to the output file without extension.",
+    )
+    known_args, pipeline_args = parser.parse_known_args(argv)
+
+    # Pipeline options. save_main_session needed for DataFlow for global imports.
+    pipeline_options = PipelineOptions(pipeline_args)
+    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

-    with beam.Pipeline() as pipeline:
+    with beam.Pipeline(options=pipeline_options) as pipeline:
         # Load the data
-        with csv_data as csv_data_file:
-            # https://github.com/apache/beam/blob/v2.32.0/sdks/python/apache_beam/io/fileio_test.py#L155-L170
         load = (
             pipeline
-            | fileio.MatchFiles(str(csv_data_file))
-            | fileio.ReadMatches()
-            | beam.FlatMap(csv_reader)
+            | "Read input data" >> beam.io.ReadFromText(known_args.input)
+            | "Split by ','" >> beam.Map(lambda element: element.split(","))
+            | "Remove leading and trailing quotes"
+            >> beam.Map(lambda element: [el.strip('"') for el in element])
         )

-        # Clean the data by dropping unneeded rows.
+        # Clean the data.
         clean_drop = (
             load
             | "Drop unneeded columns"
@@ -247,47 +340,55 @@ def main():
             >> beam.ParDo(SplitColumn(3, ","))
         )

-        # Clean the data by creating an ID, and deduplicating to eliminate repeated rows.
-        clean_deduplicate = (
+        # Create a mapping table
+        mapping_table_raw = (
             clean_drop
-            | "Generate unique ID for all columns"
-            >> beam.ParDo(GenerateUniqueID(all_columns=True))
-            | "Group by the ID for all columns"
-            >> beam.GroupBy(lambda element: element[-1])
-            | "Deduplicate by the ID for all columns" >> beam.ParDo(DeduplicateByID())
+            | "Create a mapping table with key of id_all_columns and value of cleaned data."
+            >> beam.ParDo(CreateMappingTable())
         )

-        # Prepare the data by generating an ID using the uniquely identifying information only
-        # and grouping them by this ID.
-        prepare = (
-            clean_deduplicate
-            | "Remove previous unique ID" >> beam.ParDo(RemoveUniqueID())
-            | "Generate unique ID ignoring price & date"
-            >> beam.ParDo(GenerateUniqueID())
-            | "Group by the ID ignoring price & date"
-            >> beam.GroupBy(lambda element: element[-1])
+        # Condense mapping table into a single dict.
+        mapping_table_condensed = (
+            mapping_table_raw
+            | "Condense mapping table into single dict" >> beam.combiners.ToDict()
+        )
+
+        # Prepare the data by creating IDs, grouping together and using mapping table
+        # to reinsert raw data.
+        prepared = (
+            mapping_table_raw
+            | "Create unique ID ignoring price & date"
+            >> beam.ParDo(CreateUniquePropertyID())
+            | "Group by ID"
+            >> beam.GroupByKey()
+            | "Deduplicate to eliminate repeated transactions"
+            >> beam.ParDo(DeduplicateIDs())
+            | "Insert the raw data using the mapping table"
+            >> beam.FlatMap(
+                insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed)
+            )
         )

         # Format the data into a dict.
         formatted = (
-            prepare
+            prepared
             | "Convert the prepared data into a dict object"
             >> beam.ParDo(ConvertDataToDict())
         )

         # Save the data to a .json file.
-        output_file = pathlib.Path(__file__).parent / "data" / "output" / "pp-complete"
-        output = (
+        (
             formatted
             | "Combine into one PCollection" >> beam.combiners.ToList()
+            | "Format output" >> beam.Map(json.dumps, indent=2)
             | "Save to .json file"
             >> beam.io.WriteToText(
-                file_path_prefix=str(output_file),
+                file_path_prefix=known_args.output,
                 file_name_suffix=".json",
-                shard_name_template="",
             )
         )


 if __name__ == "__main__":
-    main()
+    logging.getLogger().setLevel(logging.INFO)
+    run()
@@ -0,0 +1,3 @@
+[ZoneTransfer]
+ZoneId=3
+HostUrl=about:internet

docs/dataflow/img/successful_dataflow_job.png (new binary file, 374 KiB, not shown)

docs/dataflow/index.md (new file, 47 lines)

# Running on DataFlow

The pipeline runs as-is on GCP DataFlow. The following documents how I deployed it to my personal GCP account; the approach may vary depending on the project/account in GCP.

## Prerequisites

### Cloud Storage

- A Cloud Storage bucket with the following structure:

```
./input
./output
./tmp
```

- Place the input files into the `./input` directory in the bucket.

### VPC

To get around public IP quotas I created a VPC in the `europe-west1` region with `Private Google Access` turned `ON`.

## Command

!!! tip
    We need to choose a `worker_machine_type` with sufficient memory to run the pipeline. As the pipeline uses a mapping table, and DataFlow autoscales on CPU rather than memory usage, we need a machine with more RAM than usual to ensure sufficient memory when running on one worker. For `pp-2020.csv` the type `n1-highmem-2`, with 2 vCPUs and 13 GB of RAM, was chosen and completed successfully in ~10 minutes using only 1 worker.

Assuming the `pp-2020.csv` file has been placed in the `./input` directory in the bucket, you can run a command similar to:

```bash
python -m analyse_properties.main \
--runner DataflowRunner \
--project street-group \
--region europe-west1 \
--input gs://street-group-technical-test-dmot-euw1/input/pp-2020.csv \
--output gs://street-group-technical-test-dmot-euw1/output/pp-2020 \
--temp_location gs://street-group-technical-test-dmot-euw1/tmp \
--subnetwork=https://www.googleapis.com/compute/v1/projects/street-group/regions/europe-west1/subnetworks/europe-west-1-dataflow \
--no_use_public_ips \
--worker_machine_type=n1-highmem-2
```

The output file from this pipeline is publicly available and can be downloaded [here](https://storage.googleapis.com/street-group-technical-test-dmot-euw1/output/pp-2020-00000-of-00001.json).

The job graph for this pipeline is displayed below:

![successful dataflow pipeline](./img/successful_dataflow_job.png)

docs/dataflow/scaling.md (new file, 70 lines)

# Scaling to the full DataSet

As is, the pipeline will not run against the full dataset, but with a little work on the existing pipeline I believe it is possible to run it against the full dataset of ~27 million rows.

## Mapping table

Using a mapping table as a side-input means that for the full dataset this table is going to be huge.

Side inputs are stored in memory on the workers; with such a huge table the machines quickly run out of available memory when autoscaling is applied.

Running the pipeline against the full dataset resulted in the following error:

```text
"Out of memory: Killed process 2042 (python) total-vm:28616496kB, anon-rss:25684136kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:51284kB oom_score_adj:900"
```

with the pipeline job failing to process anything and the rows processed per second gradually falling to zero as the workers killed the Python process to try to free up more memory. This caused autoscaling to scale down (as CPU usage decreased) and the entire pipeline stagnated.

Using a higher tiered `worker_machine_type`, disabling autoscaling, and fixing the workers to the maximum number of vCPUs available to the quota gives the pipeline options:

```bash
--worker_machine_type=n1-highmem-8 \
--num_workers=3 \
--autoscaling_algorithm=NONE
```

with 156GB of RAM available to the pipeline, 52GB on each worker.

The pipeline was able to progress further until Python threw an error and the pipeline failed and shut down:

```text
"Error message from worker: Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 651, in do_work
    work_executor.execute()
  ...
  File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 393, in _send_bytes
    header = struct.pack("!i", n)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
```

The number 2147483647 is the maximum value of a 32-bit signed integer.

As the side-input needs to be pickled (serialised), this tells us that the table is far too large to be pickled and passed to the other workers. No amount of CPU/memory can fix the problem.

## Patterns

Google have several patterns for large side-inputs which are documented here:

- Part 1 <https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-1>
- Part 2 <https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-2>

## Solution

A possible solution would be to leverage BigQuery to store the results of the mapping table as the pipeline progresses. We can make use of BigQuery's array type to store the raw array as we process each row.

In addition to creating the mapping table `(key, value)` pairs, we also save these pairs to BigQuery at this stage. We then yield the element as it is currently written, to allow the subsequent stages to make use of this data.
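
A rough sketch of what saving those pairs could look like, attached to the existing `mapping_table_raw` PCollection inside the pipeline. The table name and schema here are hypothetical, and `beam.io.WriteToBigQuery` is used in its simplest form:

```python
mapping_table_to_bq = (
    mapping_table_raw
    # Turn each (id, raw_row) pair into a BigQuery row dict.
    | "To BigQuery rows" >> beam.Map(lambda kv: {"id": kv[0], "raw_data": kv[1]})
    | "Save mapping table" >> beam.io.WriteToBigQuery(
        table="street-group:mapping.id_to_raw_data",  # hypothetical table
        schema={
            "fields": [
                {"name": "id", "type": "STRING", "mode": "REQUIRED"},
                {"name": "raw_data", "type": "STRING", "mode": "REPEATED"},
            ]
        },
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    )
)
```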

Remove the condense mapping table stage as it is no longer needed (which also saves a bit of time).

Instead of using:

```python
beam.FlatMap(
    insert_data_for_id, beam.pvalue.AsSingleton(mapping_table_condensed)
)
```

to insert the results of the mapping table, we write a new `DoFn` that takes the element and, for each `id_all_columns` in the array, makes a call to BigQuery to get the array for that ID and insert it at this stage.

Because each `id_all_columns` and its corresponding data is only used once, there would be no need to cache the results from BigQuery; however, some work could be done to see if we could pull back more than one row at a time and cache these, saving time/costs in calls to BigQuery.
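
A minimal sketch of such a `DoFn`, assuming the hypothetical `street_group.mapping_table` table from the sketch above and the `google-cloud-bigquery` client. This is an illustration of the idea, not code from the repository:

```python
import apache_beam as beam
from google.cloud import bigquery


class InsertDataFromBigQuery(beam.DoFn):
    """Replace each id_all_columns in the element with its raw row from BigQuery."""

    def setup(self):
        # One client per worker, created when the worker starts.
        self.client = bigquery.Client()

    def process(self, element):
        property_id, id_list = element
        job = self.client.query(
            "SELECT id, raw_data FROM `street_group.mapping_table` "
            "WHERE id IN UNNEST(@ids)",
            job_config=bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ArrayQueryParameter("ids", "STRING", list(id_list))
                ]
            ),
        )
        replaced_list = [list(row.raw_data) for row in job.result()]
        yield (property_id, replaced_list)
```

This trades worker memory for per-element BigQuery calls, which is where pulling back and caching several IDs per query (as mentioned above) would help.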

docs/discussion/approach.md (new file, 95 lines)

# Approach

The general approach to the pipeline is:

## Loading stage

- Load using `#!python beam.io.ReadFromText()`
- Split the loaded string by `,` as it's a comma-delimited `.csv`.
- Strip the leading/trailing `"` marks.

The result is an array with each element representing a single column in that row, as sketched below.
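
As a rough illustration of what those three steps do to a single line (the line shown is a shortened, made-up example rather than a real row from the price paid file):

```python
# A shortened, made-up example line; the real file has more columns.
line = '"317500","2020-11-13 00:00","B90 3LA","1","","VERSTONE ROAD"'

# Split by "," and strip the surrounding quotes from each column.
columns = [column.strip('"') for column in line.split(",")]
# ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD']
```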

## Cleaning stage

Already discussed.

## Create a mapping table

The mapping table takes each row and creates a `(key, value)` pair with:

- The key being the id across all columns (`id_all_columns`).
- The value being the raw data as an array.

The mapping table is then condensed to a single dictionary with these key, value pairs (automatically deduplicating repeated rows) and is used as a side input further down the pipeline.

This mapping table is created to ensure the `GroupByKey` operation is as quick as possible. The more data you have to process in a `GroupByKey`, the longer the operation takes. By doing the `GroupByKey` using just the ids, the pipeline can process the files much quicker than if we included the raw data in this operation.
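
As a minimal sketch of how one cleaned row becomes a mapping-table entry (the hash value is simply whatever `md5` returns for the joined row, not a figure taken from the real output):

```python
import hashlib

row = ["317500", "2020-11-13 00:00", "B90 3LA", "1", "", "VERSTONE ROAD",
       "SHIRLEY", "SOLIHULL", "SOLIHULL", "WEST MIDLANDS", ""]

# id_all_columns: md5 of the whole row joined on ",".
id_all_columns = hashlib.md5(",".join(row).encode()).hexdigest()

# One mapping table entry: (id_all_columns, raw row).
mapping_entry = (id_all_columns, row)
```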

## Prepare stage

- Take the mapping table data (before it is condensed) and create a unique id ignoring the price and date (`id_without_price_date`).

    This id will not be unique: properties with more than one transaction will share this id.

- Create a `(key, value)` pair with:
    - The key being `id_without_price_date`.
    - The value being `id_all_columns`.
- Group by `id_without_price_date`.

This results in a PCollection that looks like: `(id_without_price_date, [id_all_columns, ...])`

- Deduplicate the `id_all_columns` inside this array to eliminate repeated rows that are exactly the same.
- Use the mapping table as a side input to reinsert the raw data using the `id_all_columns`.

<details>
<summary>Example for No. 1 B90 3LA</summary>

Mapping table (pre condensed):

```json
('fd4634faec47c29de40bbf7840723b41', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])
('fd4634faec47c29de40bbf7840723b41', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])
```

Mapping table (condensed):

```json
{'fd4634faec47c29de40bbf7840723b41': ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', '']}
```

Prepared (key, value):

```json
('fe205bfe66bc7f18c50c8f3d77ec3e30', 'fd4634faec47c29de40bbf7840723b41')
('fe205bfe66bc7f18c50c8f3d77ec3e30', 'fd4634faec47c29de40bbf7840723b41')
```

Prepared (GroupByKey):

```json
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['fd4634faec47c29de40bbf7840723b41', 'fd4634faec47c29de40bbf7840723b41'])
```

Prepared (Deduplicated):

```json
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['fd4634faec47c29de40bbf7840723b41'])
```

Use mapping table as side input:

```json
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])
```

</details>

## Format stage

This stage takes the result and constructs a `json` object out of the grouped data. The schema for this output is discussed on the following page.

## Save stage

- The PCollection is combined with `#!python beam.combiners.ToList()`
- Apply `json.dumps()` for proper quotation marks for strings.
- Write to text with `#!python beam.io.WriteToText`.

docs/discussion/cleaning.md (new file, 154 lines)

# Cleaning

In this page we discuss the cleaning stages and how best to prepare the data.

## Uniquely identifying a property

To uniquely identify a property with the data we have, it is enough to have a Postcode and the PAON (or SAON, or a combination of both).

### Postcode

Because so few properties are missing a postcode (0.2% of all records) we drop all rows that do not have one. We will drop some properties that could be identified uniquely with some more work, but the properties that are missing a postcode tend to be unusual/commercial/industrial (e.g. a power plant).

### PAON/SAON

The PAON has 3 possible formats:

- The street number.
- The building name.
- The building name and street number (comma delimited).

The SAON:

- Identifies the apartment/flat number for the building.
- If the SAON is present (only 11.7% of values) then the PAON will be either:
    - The building name.
    - The building name and street number.

Because of the way the PAON and SAON are defined, if any row is missing **both** of these columns we drop it, as having only the postcode is (generally speaking) not enough to uniquely identify a property.

!!! tip
    In a production environment we could send these rows to a sink table (in BigQuery for example), rather than drop them outright. Collecting these rows over time might show some patterns in how we can uniquely identify properties that are missing these fields.

We split the PAON as part of the cleaning stage. If the PAON contains a comma then it contains the building name and street number. We keep the street number in the same position as the PAON and insert the building name as a new column at the end of the row. If the PAON does not contain a comma we insert a blank column at the end to keep the number of columns in the PCollection consistent.
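
A minimal standalone sketch of that split (the pipeline itself does this inside its `SplitColumn` `DoFn`, applied to column 3; the example PAON below is made up):

```python
def split_paon(row, index=3, split_char=","):
    """Keep the street number in place and append the building name (or a blank)."""
    try:
        building, number = row[index].split(split_char)
        row[index] = number.strip()
        row.append(building.strip())
    except ValueError:
        # No comma (or an unexpected number of parts): append a blank column
        # so every row keeps the same number of columns.
        row.append("")
    return row


# A made-up PAON "EXAMPLE HOUSE, 12": "12" stays at index 3, "EXAMPLE HOUSE" is appended.
```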

### Unneeded columns

To try to keep computation costs/time down, I decided to drop the categorical columns provided. These include:

- Property Type.
- Old/New.
- Duration.
- PPD Category Type.
- Record Status - monthly file only.

Initially I was attempting to work against the full dataset, so dropping these columns makes a difference in the amount of data that needs processing; the dropping itself is done with the pipeline's `slice_by_range` helper, sketched below.
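
To illustrate the semantics of `slice_by_range` (the ranges here are made up for the example; they are not the exact ranges the pipeline passes):

```python
import itertools


def slice_by_range(element, *ranges):
    """Slice a list with multiple (start, end) ranges, dropping everything else."""
    return itertools.chain(*(itertools.islice(element, *r) for r in ranges))


row = ["a", "b", "c", "d", "e", "f", "g"]
kept = list(slice_by_range(row, (0, 3), (5, 6)))
# ['a', 'b', 'c', 'f']: keeps columns 0, 1, 2 and 5, drops everything else.
```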

These columns are also not consistent. E.g. the property `63` `B16 0AE` has three transactions: two of these transactions have a property type of `Other` and one has a property type of `Terraced`.

These columns do provide some relevant information (old/new, duration, property type) and they could be added back into the pipeline fairly easily. Due to time constraints I was unable to make this change.

In addition, I also dropped the transaction unique identifier column. I wanted the IDs calculated in the pipeline to be consistent in format, and hashing a string (md5) isn't that expensive to calculate, with complexity $\mathcal{O}(n)$.

### General cleaning

#### Upper case

As all strings in the dataset are upper case, we convert everything in the row to upper case to enforce consistency across the dataset.

#### Strip leading/trailing whitespace

We strip all leading/trailing whitespace from each column to enforce consistency.

#### Repeated rows

Some of the data is repeated:

- Some rows are repeated, with the same date + price + address information but with a unique transaction id.

<details>
<summary>Example (PCollection)</summary>

```json
[
    {
        "fd4634faec47c29de40bbf7840723b41": [
            "317500",
            "2020-11-13 00:00",
            "B90 3LA",
            "1",
            "",
            "VERSTONE ROAD",
            "SHIRLEY",
            "SOLIHULL",
            "SOLIHULL",
            "WEST MIDLANDS",
            ""
        ]
    },
    {
        "gd4634faec47c29de40bbf7840723b42": [
            "317500",
            "2020-11-13 00:00",
            "B90 3LA",
            "1",
            "",
            "VERSTONE ROAD",
            "SHIRLEY",
            "SOLIHULL",
            "SOLIHULL",
            "WEST MIDLANDS",
            ""
        ]
    }
]
```

</details>

These rows will be deduplicated as part of the pipeline.

- Some rows have the same date + address information, but different prices.

It would be very unusual to see multiple transactions on the same date for the same property. One reason could be a data entry error, resulting in two different transactions with only one being the real price. As the date column does not contain the time (it is fixed at `00:00`) it is impossible to tell.

Another reason could be missing building/flat/apartment information in this entry.

We **keep** these in the data, resulting in some properties having multiple transactions with different prices on the same date. Without a time or more information to go on, it is difficult to see how these could be filtered out.

<details>
<summary>Example (Output)</summary>

```json
[
    {
        "property_id": "20d5c335c8d822a40baab0ecd57e92a4",
        "readable_address": "53 PAVENHAM DRIVE\nBIRMINGHAM\nWEST MIDLANDS\nB5 7TN",
        "flat_appartment": "",
        "builing": "",
        "number": "53",
        "street": "PAVENHAM DRIVE",
        "locality": "",
        "town": "BIRMINGHAM",
        "district": "BIRMINGHAM",
        "county": "WEST MIDLANDS",
        "postcode": "B5 7TN",
        "property_transactions": [
            {
                "price": 270000,
                "transaction_date": "2020-04-23",
                "year": 2020
            },
            {
                "price": 364000,
                "transaction_date": "2020-04-23",
                "year": 2020
            }
        ],
        "latest_transaction_year": 2020
    }
]
```

</details>

docs/discussion/exploration.md (new file, 30 lines)

# Data Exploration Report

A brief exploration was done on the **full** dataset using the module `pandas-profiling`. The module uses `pandas` to load a dataset and automatically produces quantile/descriptive statistics, common values, extreme values, skew, kurtosis etc., and generates a report `.html` file that can be viewed interactively in your browser.

The script used to generate this report is located in `./exploration/report.py` and can be viewed below.

<details>
<summary>report.py</summary>

```python
--8<-- "exploration/report.py"
```

</details>
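
For readers viewing this outside the rendered docs (where the snippet above is inlined by the docs build), the core of `report.py` amounts to roughly the following; the title and `minimal` flag here are illustrative choices rather than the exact settings of the real script:

```python
import pandas as pd
from pandas_profiling import ProfileReport

# The price paid CSV has no header row, so read it without one.
df = pd.read_csv("data/input/pp-complete.csv", header=None)

report = ProfileReport(df, title="Price Paid Data", minimal=True)
report.to_file("report.html")
```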

The report can be viewed by clicking the Data Exploration Report tab at the top of the page.

## Interesting observations

When looking at the report we are looking for data quality and missing observations. The statistics are interesting to see but are largely irrelevant for this task.

The data overall looks very good for a dataset of its size (~27 million records). For important fields there are no missing values:

- Every row has a price.
- Every row has a unique transaction ID.
- Every row has a transaction date.

Some fields that we will need are missing data:

- ~42,000 (0.2%) are missing a Postcode.
- ~4,000 (<0.1%) are missing a PAON (primary addressable object name).
- ~412,000 (1.6%) are missing a Street Name.

docs/discussion/introduction.md (new file, 7 lines)

# Introduction

This section will go through some discussion of the test, including:

- Data exploration
- Cleaning the data
- Interpreting the results

docs/discussion/results.md (new file, 51 lines)

# Results

The resulting output `.json` looks like this (for the previous example using No. 1 `B90 3LA`):

```json
[
    {
        "property_id": "fe205bfe66bc7f18c50c8f3d77ec3e30",
        "readable_address": "1 VERSTONE ROAD\nSHIRLEY\nSOLIHULL\nWEST MIDLANDS\nB90 3LA",
        "flat_appartment": "",
        "builing": "",
        "number": "1",
        "street": "VERSTONE ROAD",
        "locality": "SHIRLEY",
        "town": "SOLIHULL",
        "district": "SOLIHULL",
        "county": "WEST MIDLANDS",
        "postcode": "B90 3LA",
        "property_transactions": [
            {
                "price": 317500,
                "transaction_date": "2020-11-13",
                "year": 2020
            }
        ],
        "latest_transaction_year": 2020
    }
]
```

The standard property information is included; we will briefly discuss the additional fields included in this output file.

## readable_address

The components that make up the address in the dataset are often repetitive, with the locality, town/city, district and county often sharing the same value. This can result in hard-to-read addresses if we simply stacked all the components sequentially.

The `readable_address` provides an easy-to-read address that strips this repetitiveness out by doing pairwise comparisons of the four components and applying a mask. The result is an address that could be served to the end user, or easily displayed on a page.

This saves any user having to apply the same logic just to display the address somewhere; the full address of a property should be easy to read and easily accessible.
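
A simplified sketch of that idea (not the pipeline's exact implementation, which lives in `ConvertDataToDict.get_readable_address`): drop a component when it merely repeats its neighbour, drop empties, then stack the remaining parts on separate lines.

```python
import itertools


def readable_address(address_components, comparisons):
    # Mask out a locality/town/district/county entry when it repeats the previous one.
    mask = [True] + [
        current != previous for previous, current in zip(comparisons, comparisons[1:])
    ]
    deduplicated = list(itertools.compress(comparisons, mask))
    # Drop empty parts and stack everything on separate lines.
    parts = list(filter(None, address_components)) + list(filter(None, deduplicated))
    return "\n".join(parts)


readable_address(["1 VERSTONE ROAD"], ["SHIRLEY", "SOLIHULL", "SOLIHULL", "WEST MIDLANDS"])
# -> '1 VERSTONE ROAD\nSHIRLEY\nSOLIHULL\nWEST MIDLANDS'
```

In the real output the postcode is then appended as the final line, as the example above shows.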

## property_transactions

This array contains an object for each transaction for that property, with the price and year as an `int` and the date having the `00:00` time stripped out.

## latest_transaction_year

The date of the latest transaction is extracted from the array of `property_transactions` and placed at the top level of the `json` object. This allows any end user to easily search for properties that haven't been sold in a period of time, without having to write this logic themselves.

A consumer should be able to use this data to answer questions like:

- Give me all properties in the town of Solihull that haven't been sold in the past 10 years.
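
For example, a consumer could answer that question directly from the output file with something along these lines (the file path and the 10-year cut-off are assumptions for the example):

```python
import json

with open("data/output/pp-2020.json") as output_file:
    properties = json.load(output_file)

stale_solihull = [
    prop["readable_address"]
    for prop in properties
    if prop["town"] == "SOLIHULL" and prop["latest_transaction_year"] <= 2011
]
```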

docs/documentation/installation.md (new file, 31 lines)

# Installation

The task is written in Python 3.7.9 using Apache Beam 2.32.0. Python versions 3.6.14 and 3.8.11 should also be compatible but have not been tested.

The task has been tested on macOS Big Sur and WSL2. It should run on Windows but this wasn't tested.

For Beam 2.32.0 the supported versions of the Python SDK can be found [here](https://cloud.google.com/dataflow/docs/concepts/sdk-worker-dependencies#sdk-for-python).

## Pip

In a virtual environment, run from the root of the repo:

```bash
pip install -r requirements.txt
```

## Poetry (Alternative)

Install [Poetry](https://python-poetry.org) *globally*.

From the root of the repo install the dependencies with:

```bash
poetry install --no-dev
```

Activate the shell with:

```bash
poetry shell
```

docs/documentation/usage.md (new file, 59 lines)

# Usage

This page documents how to run the pipeline locally to complete the task for the [dataset for 2020](https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads#section-1).

The pipeline also runs in GCP using DataFlow; this is discussed further on and can be viewed [here](../dataflow/index.md). We also discuss how to adapt the pipeline so it can run against [the full dataset](https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads#single-file).

## Download dataset

The input data by default should go in `./data/input`.

For convenience the data is available publicly in a GCP Cloud Storage bucket.

Run:

```bash
wget https://storage.googleapis.com/street-group-technical-test-dmot-euw1/input/pp-2020.csv -P data/input
```

to download the data for 2020 and place it in the input directory above.

## Entrypoint

The entrypoint to the pipeline is `analyse_properties.main`.

## Available options

Running

```bash
python -m analyse_properties.main --help
```

gives the following output:

```bash
usage: analyse_properties.main [-h] [--input INPUT] [--output OUTPUT]

optional arguments:
  -h, --help       show this help message and exit
  --input INPUT    Full path to the input file.
  --output OUTPUT  Full path to the output file without extension.
```

The default value for input is `./data/input/pp-2020.csv` and the default value for output is `./data/output/pp-2020`.

## Run the pipeline

To run the pipeline and complete the task, run:

```bash
python -m analyse_properties.main \
--runner DirectRunner \
--input ./data/input/pp-2020.csv \
--output ./data/output/pp-2020
```

from the root of the repo.

The pipeline will use the 2020 dataset located in `./data/input` and output the resulting `.json` to `./data/output`.

@@ -3,3 +3,10 @@
 ## Introduction

 This documentation accompanies the technical test for the Street Group.
+
+The following pages will guide the user through installing the requirements, and running the task to complete the test. In addition, there is some discussion around the approach, and scaling the pipeline.
+
+Navigate sections using the tabs at the top of the page. Pages in this section can be viewed in order by using the section links in the left menu, or by using bar at the bottom of the page. The table of contents in the right menu can be used to navigate sections on each page.
+
+!!! note
+    All paths in this documentation, e.g `./analyse_properties/data/output` refer to the location of the directory/file from the root of the repo.
@@ -1,5 +1,8 @@
 # Full data set
-wget https://storage.googleapis.com/street-group-technical-test-dmot/pp-complete.csv -P analyse_properties/data/input
+# wget https://storage.googleapis.com/street-group-technical-test-dmot-euw1/input/pp-complete.csv -P data/input

 # Monthly update data set
-# wget https://storage.googleapis.com/street-group-technical-test-dmot/pp-monthly-update-new-version.csv -P analyse_properties/data/input
+# wget https://storage.googleapis.com/street-group-technical-test-dmot-euw1/input/pp-monthly-update-new-version.csv -P data/input
+
+# 2020 data set
+wget https://storage.googleapis.com/street-group-technical-test-dmot-euw1/input/pp-2020.csv -P data/input
@@ -1,13 +1,16 @@
-from importlib import resources
+import pathlib

 import pandas as pd
 from pandas_profiling import ProfileReport


 def main():
-    with resources.path("analyse_properties.data", "pp-complete.csv") as csv_file:
+    input_file = (
+        pathlib.Path(__file__).parents[1] / "data" / "input" / "pp-complete.csv"
+    )
+    with input_file.open() as csv:
         df_report = pd.read_csv(
-            csv_file,
+            csv,
             names=[
                 "transaction_id",
                 "price",

mkdocs.yaml (35 lines changed)

@@ -4,37 +4,40 @@ use_directory_urls: false
 nav:
   - Documentation:
       - Welcome: index.md
-      # - Installation: documentation/installation.md
-      # - Usage: documentation/usage.md
-  # - Comments and Caveats:
-  #     - Introduction: comments_caveats/introduction.md
-  #     - Time limit: comments_caveats/time_limit.md
-  #     - Third party libraries: comments_caveats/third_party_libraries.md
-  #     - Areas of improvement and comments: comments_caveats/area_of_improvement_comments.md
-  #     - Similar names algorithm: comments_caveats/similar_names.md
-  # - Reference:
-  #     - deduplicator.main: reference/api_documentation_main.md
-  # - Changelog: changelog/changelog.md
+      - Installation: documentation/installation.md
+      - Usage: documentation/usage.md
+  - Discussion:
+      - Introduction: discussion/introduction.md
+      - Data Exploration Report: discussion/exploration.md
+      - Cleaning: discussion/cleaning.md
+      - Approach: discussion/approach.md
+      - Results: discussion/results.md
+  - DataFlow:
+      - Running on DataFlow: dataflow/index.md
+      - Scaling to the Full DataSet: dataflow/scaling.md
+  - Data Exploration Report: pandas-profiling/report.html
 theme:
   name: material
   palette:
     primary: indigo
     accent: blue
-  feature:
-    tabs: true
+  features:
+    navigation.tabs: true
 markdown_extensions:
   - admonition
   - codehilite:
       guess_lang: true
   - toc:
       permalink: true
+  - pymdownx.highlight
   - pymdownx.superfences
-  # - pymdownx.arithmatex:
-  #     generic: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.arithmatex:
+      generic: true
 plugins:
   - search:
       lang: en
 extra_javascript:
-  - javascripts/config.js
   - https://polyfill.io/v3/polyfill.min.js?features=es6
   - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js

notes/commands.md (new file, 6 lines)

# Commands

## mkdocs

`mkdocs serve`
`mkdocs gh-deploy`
95
notes/documentation/dataflow.md
Normal file
95
notes/documentation/dataflow.md
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
# DataFlow
|
||||||
|
|
||||||
|
<https://cloud.google.com/dataflow/docs/quickstarts/quickstart-python>
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
Full example of beam pipeline on dataflow:
|
||||||
|
|
||||||
|
<https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/complete/juliaset>
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
Export env variable:
|
||||||
|
|
||||||
|
`export GOOGLE_APPLICATION_CREDENTIALS="/home/dtomlinson/git-repos/work/street_group/street_group_tech_test/street-group-0c490d23a9d0.json"`
|
||||||
|
|
||||||
|
## Run pipeline
|
||||||
|
|
||||||
|
### Dataflow
|
||||||
|
|
||||||
|
#### Yearly dataset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m analyse_properties.main \
|
||||||
|
--runner DataflowRunner \
|
||||||
|
--project street-group \
|
||||||
|
--region europe-west1 \
|
||||||
|
--input gs://street-group-technical-test-dmot-euw1/input/pp-2020.csv \
|
||||||
|
--output gs://street-group-technical-test-dmot-euw1/output/pp-2020 \
|
||||||
|
--temp_location gs://street-group-technical-test-dmot-euw1/tmp \
|
||||||
|
--subnetwork=https://www.googleapis.com/compute/v1/projects/street-group/regions/europe-west1/subnetworks/europe-west-1-dataflow \
|
||||||
|
--no_use_public_ips \
|
||||||
|
--worker_machine_type=n1-highmem-2
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Full dataset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m analyse_properties.main \
|
||||||
|
--region europe-west1 \
|
||||||
|
--input gs://street-group-technical-test-dmot-euw1/input/pp-complete.csv \
|
||||||
|
--output gs://street-group-technical-test-dmot-euw1/output/pp-complete \
|
||||||
|
--runner DataflowRunner \
|
||||||
|
--project street-group \
|
||||||
|
--temp_location gs://street-group-technical-test-dmot-euw1/tmp \
|
||||||
|
--subnetwork=https://www.googleapis.com/compute/v1/projects/street-group/regions/europe-west1/subnetworks/europe-west-1-dataflow \
|
||||||
|
--no_use_public_ips \
|
||||||
|
--worker_machine_type=n1-highmem-8 \
|
||||||
|
--num_workers=3 \
|
||||||
|
--autoscaling_algorithm=NONE
|
||||||
|
```
|
||||||
|
|
||||||
|
### Locally
|
||||||
|
|
||||||
|
Run the pipeline locally:
|
||||||
|
|
||||||
|
`python -m analyse_properties.main --runner DirectRunner`
|
||||||
|
|
||||||
|
## Errors
|
||||||
|
|
||||||
|
Unsubscriptable error on window:
|
||||||
|
|
||||||
|
<https://stackoverflow.com/questions/42276520/what-does-object-of-type-unwindowedvalues-has-no-len-mean>
|
||||||
|
|
||||||

## Documentation

Running in its own private VPC without public IPs:

- <https://stackoverflow.com/questions/58893082/which-compute-engine-quotas-need-to-be-updated-to-run-dataflow-with-50-workers>
- <https://cloud.google.com/dataflow/docs/guides/specifying-networks#subnetwork_parameter>

Error help:

- <https://cloud.google.com/dataflow/docs/guides/common-errors>
- <https://cloud.google.com/dataflow/docs/guides/troubleshooting-your-pipeline>

Scaling:

- Using Dataflow Prime: <https://cloud.google.com/dataflow/docs/guides/enable-dataflow-prime#enable-prime> (use `--experiments=enable_prime`)
- Deploying a pipeline (with scaling options): <https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline>
- Available VM types (with pricing): <https://cloud.google.com/compute/vm-instance-pricing#n1_predefined>

Performance:

- Side input performance: <https://stackoverflow.com/questions/48242320/google-dataflow-apache-beam-python-side-input-from-pcollection-kills-perform>
- Common use cases, part 1: <https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-1>
- Common use cases, part 2: <https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-2>
- Slow-updating side inputs: <https://cloud.google.com/architecture/e-commerce/patterns/slow-updating-side-inputs>
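
The side-input links above are background reading; none of the snippets there are used verbatim in this repo. A minimal sketch of the pattern they discuss, passing a small PCollection into a `Map` as a dict side input (names invented for illustration):

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    towns = pipeline | "Towns" >> beam.Create([("B16 0AE", "BIRMINGHAM")])
    postcodes = pipeline | "Postcodes" >> beam.Create(["B16 0AE"])
    (
        postcodes
        # AsDict materialises the (small) side input on every worker, which is
        # why the performance links above warn against using large ones.
        | beam.Map(
            lambda postcode, lookup: (postcode, lookup.get(postcode)),
            lookup=beam.pvalue.AsDict(towns),
        )
        | beam.Map(print)
    )
```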
27 notes/tmp/errordata Normal file
@@ -0,0 +1,27 @@
"Error message from worker: Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 651, in do_work
    work_executor.execute()
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 181, in execute
    op.finish()
  File "dataflow_worker/native_operations.py", line 93, in dataflow_worker.native_operations.NativeWriteOperation.finish
  File "dataflow_worker/native_operations.py", line 94, in dataflow_worker.native_operations.NativeWriteOperation.finish
  File "dataflow_worker/native_operations.py", line 95, in dataflow_worker.native_operations.NativeWriteOperation.finish
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/nativeavroio.py", line 308, in __exit__
    self._data_file_writer.flush()
  File "fastavro/_write.pyx", line 664, in fastavro._write.Writer.flush
  File "fastavro/_write.pyx", line 639, in fastavro._write.Writer.dump
  File "fastavro/_write.pyx", line 451, in fastavro._write.snappy_write_block
  File "fastavro/_write.pyx", line 458, in fastavro._write.snappy_write_block
  File "/usr/local/lib/python3.7/site-packages/apache_beam/io/filesystemio.py", line 200, in write
    self._uploader.put(b)
  File "/usr/local/lib/python3.7/site-packages/apache_beam/io/gcp/gcsio.py", line 720, in put
    self._conn.send_bytes(data.tobytes())
  File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 393, in _send_bytes
    header = struct.pack("!i", n)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
"

"Out of memory: Killed process 2042 (python) total-vm:28616496kB, anon-rss:25684136kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:51284kB oom_score_adj:900"
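
These two messages appear to come from the full-dataset Dataflow run: the first shows the GCS uploader handing a single buffer to a `multiprocessing` connection whose length header is a signed 32-bit integer, so any one write over ~2 GiB cannot be framed; the second is a worker being killed by the OOM killer. A tiny illustration of the 32-bit limit in the traceback:

```python
import struct

# The multiprocessing connection frames each payload with struct.pack("!i", n),
# a signed 32-bit length, so n must stay below 2**31.
print(struct.pack("!i", 2**31 - 1))  # largest length that still fits
try:
    struct.pack("!i", 2**31)
except struct.error as exc:
    print(exc)  # 'i' format requires -2147483648 <= number <= 2147483647
```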
44 notes/tmp/exampledata Normal file
@@ -0,0 +1,44 @@
[
  {
    "property_id": "3cf3c06632c46754696f2017933702f3",
    "flat_appartment": "",
    "builing": "",
    "number": "63",
    "street": "ROTTON PARK STREET",
    "locality": "",
    "town": "BIRMINGHAM",
    "district": "BIRMINGHAM",
    "county": "WEST MIDLANDS",
    "postcode": "B16 0AE",
    "property_transactions": [
      { "price": "385000", "transaction_date": "2021-01-08", "year": "2021" },
      { "price": "701985", "transaction_date": "2019-03-28", "year": "2019" },
      { "price": "1748761", "transaction_date": "2020-05-27", "year": "2020" }
    ],
    "latest_transaction_year": "2021"
  },
  {
    "property_id": "c650d5d7bb0daf0a19bb2cacabbee74e",
    "readable_address": "16 STATION ROAD\nPARKGATE\nNESTON\nCHESHIRE WEST AND CHESTER\nCH64 6QJ",
    "flat_appartment": "",
    "builing": "",
    "number": "16",
    "street": "STATION ROAD",
    "locality": "PARKGATE",
    "town": "NESTON",
    "district": "CHESHIRE WEST AND CHESTER",
    "county": "CHESHIRE WEST AND CHESTER",
    "postcode": "CH64 6QJ",
    "property_transactions": [
      { "price": "280000", "transaction_date": "2020-11-30", "year": "2020" },
      { "price": "265000", "transaction_date": "2020-05-29", "year": "2020" }
    ],
    "latest_transaction_year": "2020"
  }
]
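
The example records are an array of deduplicated properties, each carrying every transaction seen for that address. A small sketch of reading it back, assuming the file is valid JSON as shown:

```python
import json

# Load the example output stored at notes/tmp/exampledata.
with open("notes/tmp/exampledata") as handle:
    properties = json.load(handle)

for prop in properties:
    # e.g. "B16 0AE 63 3" - postcode, house number, transaction count.
    print(prop["postcode"], prop["number"], len(prop["property_transactions"]))
```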
16 notes/tmp/runningdata Normal file
@@ -0,0 +1,16 @@

Create Mapping table
('fd4634faec47c29de40bbf7840723b41', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])
('fd4634faec47c29de40bbf7840723b41', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])

Condensing
{'fd4634faec47c29de40bbf7840723b41': ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', '']}

Prepared
GroupByKey
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['fd4634faec47c29de40bbf7840723b41', 'fd4634faec47c29de40bbf7840723b41'])

deduplicated
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['fd4634faec47c29de40bbf7840723b41'])
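
The scratch output above shows one property key collecting the same row hash twice (the duplicated input row) and then being reduced to a single hash. A minimal sketch of that GroupByKey-plus-dedup step (keys truncated here for brevity; these are not the repo's actual transforms):

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    (
        pipeline
        # (property hash, row hash) pairs - the duplicated row appears twice.
        | beam.Create([("fe205bfe", "fd4634fa"), ("fe205bfe", "fd4634fa")])
        | beam.GroupByKey()
        # Collapse identical row hashes under each property key.
        | beam.MapTuple(lambda prop_id, row_hashes: (prop_id, sorted(set(row_hashes))))
        | beam.Map(print)  # ('fe205bfe', ['fd4634fa'])
    )
```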
141 poetry.lock generated
@@ -136,7 +136,7 @@ unicode_backport = ["unicodedata2"]
 name = "click"
 version = "8.0.1"
 description = "Composable command line interface toolkit"
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -148,7 +148,7 @@ importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
 name = "colorama"
 version = "0.4.4"
 description = "Cross-platform colored terminal text."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 
@@ -258,9 +258,9 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
 
 [[package]]
 name = "ghp-import"
-version = "2.0.1"
+version = "2.0.2"
 description = "Copy your docs directly to the gh-pages branch."
-category = "main"
+category = "dev"
 optional = false
 python-versions = "*"
 
@@ -268,7 +268,7 @@ python-versions = "*"
 python-dateutil = ">=2.8.1"
 
 [package.extras]
-dev = ["twine", "markdown", "flake8"]
+dev = ["twine", "markdown", "flake8", "wheel"]
 
 [[package]]
 name = "google-api-core"
@@ -535,7 +535,7 @@ grpcio = ">=1.0.0,<2.0.0dev"
 
 [[package]]
 name = "grpcio"
-version = "1.40.0"
+version = "1.41.0"
 description = "HTTP/2-based RPC framework"
 category = "main"
 optional = false
@@ -545,7 +545,7 @@ python-versions = "*"
 six = ">=1.5.2"
 
 [package.extras]
-protobuf = ["grpcio-tools (>=1.40.0)"]
+protobuf = ["grpcio-tools (>=1.41.0)"]
 
 [[package]]
 name = "grpcio-gcp"
@@ -622,7 +622,7 @@ six = "*"
 name = "importlib-metadata"
 version = "4.8.1"
 description = "Read metadata from Python packages"
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -653,7 +653,7 @@ plugins = ["setuptools"]
 name = "jinja2"
 version = "3.0.1"
 description = "A very fast and expressive template engine."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -691,7 +691,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 name = "markdown"
 version = "3.3.4"
 description = "Python implementation of Markdown."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -705,7 +705,7 @@ testing = ["coverage", "pyyaml"]
 name = "markupsafe"
 version = "2.0.1"
 description = "Safely add untrusted strings to HTML/XML markup."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -737,7 +737,7 @@ python-versions = "*"
 name = "mergedeep"
 version = "1.3.4"
 description = "A deep merge function for 🐍."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -762,7 +762,7 @@ tests = ["pytest", "pytest-mpl"]
 name = "mkdocs"
 version = "1.2.2"
 description = "Project documentation with Markdown."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -785,7 +785,7 @@ i18n = ["babel (>=2.9.0)"]
 name = "mkdocs-material"
 version = "7.3.0"
 description = "A Material Design theme for MkDocs"
-category = "main"
+category = "dev"
 optional = false
 python-versions = "*"
 
@@ -800,7 +800,7 @@ pymdown-extensions = ">=7.0"
 name = "mkdocs-material-extensions"
 version = "1.0.3"
 description = "Extension pack for Python Markdown."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -956,7 +956,7 @@ python-versions = ">=3.6"
 
 [[package]]
 name = "platformdirs"
-version = "2.3.0"
+version = "2.4.0"
 description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
 category = "dev"
 optional = false
@@ -1111,7 +1111,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 name = "pygments"
 version = "2.10.0"
 description = "Pygments is a syntax highlighting package written in Python."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.5"
 
@@ -1186,7 +1186,7 @@ pylint = ">=1.7"
 name = "pymdown-extensions"
 version = "8.2"
 description = "Extension pack for Python Markdown."
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -1253,7 +1253,7 @@ numpy = ">=1.13.3"
 name = "pyyaml"
 version = "5.4.1"
 description = "YAML parser and emitter for Python"
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 
@@ -1261,7 +1261,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 name = "pyyaml-env-tag"
 version = "0.1"
 description = "A custom YAML tag for referencing environment variables in YAML files. "
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -1452,7 +1452,7 @@ type_image_path = ["imagehash", "pillow"]
 name = "watchdog"
 version = "2.1.5"
 description = "Filesystem events monitoring"
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -1471,7 +1471,7 @@ python-versions = "*"
 name = "zipp"
 version = "3.5.0"
 description = "Backport of pathlib-compatible object wrapper for zip files"
-category = "main"
+category = "dev"
 optional = false
 python-versions = ">=3.6"
 
@@ -1482,7 +1482,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7"
-content-hash = "c9292b385b6067c194a7e31bc62ea4c04c99b951d3f4fa1b9b8f081ddf270c4c"
+content-hash = "c710ab077268b067a2d2e900a7ca426bac3a9d9512d63ef3b517cd0e55477329"
 
 [metadata.files]
 apache-beam = [
@@ -1601,7 +1601,8 @@ future = [
 {file = "future-0.18.2.tar.gz", hash = "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"},
 ]
 ghp-import = [
-{file = "ghp-import-2.0.1.tar.gz", hash = "sha256:753de2eace6e0f7d4edfb3cce5e3c3b98cd52aadb80163303d1d036bda7b4483"},
+{file = "ghp-import-2.0.2.tar.gz", hash = "sha256:947b3771f11be850c852c64b561c600fdddf794bab363060854c1ee7ad05e071"},
+{file = "ghp_import-2.0.2-py3-none-any.whl", hash = "sha256:5f8962b30b20652cdffa9c5a9812f7de6bcb56ec475acac579807719bf242c46"},
 ]
 google-api-core = [
 {file = "google-api-core-1.31.2.tar.gz", hash = "sha256:8500aded318fdb235130bf183c726a05a9cb7c4b09c266bd5119b86cdb8a4d10"},
@@ -1716,50 +1717,50 @@ grpc-google-iam-v1 = [
 {file = "grpc-google-iam-v1-0.12.3.tar.gz", hash = "sha256:0bfb5b56f648f457021a91c0df0db4934b6e0c300bd0f2de2333383fe958aa72"},
 ]
 grpcio = [
-{file = "grpcio-1.40.0-cp35-cp35m-macosx_10_10_intel.whl", hash = "sha256:6f8f581787e739945e6cda101f312ea8a7e7082bdbb4993901eb828da6a49092"},
+{file = "grpcio-1.41.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:9ecd0fc34aa46eeac24f4d20e67bafaf72ca914f99690bf2898674905eaddaf9"},
-{file = "grpcio-1.40.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:a4389e26a8f9338ca91effdc5436dfec67d6ecd296368dba115799ae8f8e5bdb"},
+{file = "grpcio-1.41.0-cp310-cp310-macosx_10_10_universal2.whl", hash = "sha256:d539ebd05a2bbfbf897d41738d37d162d5c3d9f2b1f8ddf2c4f75e2c9cf59907"},
-{file = "grpcio-1.40.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:fb06708e3d173e387326abcd5182d52beb60e049db5c3d317bd85509e938afdc"},
+{file = "grpcio-1.41.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:2410000eb57cf76b05b37d2aee270b686f0a7876710850a2bba92b4ed133e026"},
-{file = "grpcio-1.40.0-cp35-cp35m-manylinux2014_i686.whl", hash = "sha256:f06e07161c21391682bfcac93a181a037a8aa3d561546690e9d0501189729aac"},
+{file = "grpcio-1.41.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be3c6ac822edb509aeef41361ca9c8c5ee52cb9e4973e1977d2bb7d6a460fd97"},
-{file = "grpcio-1.40.0-cp35-cp35m-manylinux2014_x86_64.whl", hash = "sha256:5ff0dcf66315f3f00e1a8eb7244c6a49bdb0cc59bef4fb65b9db8adbd78e6acb"},
+{file = "grpcio-1.41.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0c4bdd1d646365d10ba1468bcf234ea5ad46e8ce2b115983e8563248614910a"},
-{file = "grpcio-1.40.0-cp35-cp35m-win32.whl", hash = "sha256:ba9dd97ea1738be3e81d34e6bab8ff91a0b80668a4ec81454b283d3c828cebde"},
+{file = "grpcio-1.41.0-cp310-cp310-win32.whl", hash = "sha256:7033199706526e7ee06a362e38476dfdf2ddbad625c19b67ed30411d1bb25a18"},
-{file = "grpcio-1.40.0-cp35-cp35m-win_amd64.whl", hash = "sha256:e12d776a240fee3ebd002519c02d165d94ec636d3fe3d6185b361bfc9a2d3106"},
+{file = "grpcio-1.41.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb64abf0d92134cb0ba4496a3b7ab918588eee42de20e5b3507fe6ee16db97ee"},
-{file = "grpcio-1.40.0-cp36-cp36m-linux_armv7l.whl", hash = "sha256:6b9b432f5665dfc802187384693b6338f05c7fc3707ebf003a89bd5132074e27"},
+{file = "grpcio-1.41.0-cp36-cp36m-linux_armv7l.whl", hash = "sha256:b6b68c444abbaf4a2b944a61cf35726ab9645f45d416bcc7cf4addc4b2f2d53d"},
-{file = "grpcio-1.40.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:886d056f5101ac513f4aefe4d21a816d98ee3f9a8e77fc3bcb4ae1a3a24efe26"},
+{file = "grpcio-1.41.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:5292a627b44b6d3065de4a364ead23bab3c9d7a7c05416a9de0c0624d0fe03f4"},
-{file = "grpcio-1.40.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:b1b34e5a6f1285d1576099c663dae28c07b474015ed21e35a243aff66a0c2aed"},
+{file = "grpcio-1.41.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:1820845e7e6410240eff97742e9f76cd5bf10ca01d36a322e86c0bd5340ac25b"},
-{file = "grpcio-1.40.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:17ed13d43450ef9d1f9b78cc932bcf42844ca302235b93026dfd07fb5208d146"},
+{file = "grpcio-1.41.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:462178987f0e5c60d6d1b79e4e95803a4cd789db961d6b3f087245906bb5ae04"},
-{file = "grpcio-1.40.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:e19de138199502d575fcec5cf68ae48815a6efe7e5c0d0b8c97eba8c77ae9f0e"},
+{file = "grpcio-1.41.0-cp36-cp36m-manylinux_2_17_aarch64.whl", hash = "sha256:7b07cbbd4eea56738e995fcbba3b60e41fd9aa9dac937fb7985c5dcbc7626260"},
-{file = "grpcio-1.40.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:a812164ceb48cb62c3217bd6245274e693c624cc2ac0c1b11b4cea96dab054dd"},
+{file = "grpcio-1.41.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a92e4df5330cd384984e04804104ae34f521345917813aa86fc0930101a3697"},
-{file = "grpcio-1.40.0-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:eedc8c3514c10b6f11c6f406877e424ca29610883b97bb97e33b1dd2a9077f6c"},
+{file = "grpcio-1.41.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd2f1cf11768d1f6fbe4e13e8b8fb0ccfe9914ceeff55a367d5571e82eeb543"},
-{file = "grpcio-1.40.0-cp36-cp36m-win32.whl", hash = "sha256:1708a0ba90c798b4313f541ffbcc25ed47e790adaafb02111204362723dabef0"},
+{file = "grpcio-1.41.0-cp36-cp36m-win32.whl", hash = "sha256:59645b2d9f19b5ff30cb46ddbcaa09c398f9cd81e4e476b21c7c55ae1e942807"},
-{file = "grpcio-1.40.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d760a66c9773780837915be85a39d2cd4ab42ef32657c5f1d28475e23ab709fc"},
+{file = "grpcio-1.41.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0abd56d90dff3ed566807520de1385126dded21e62d3490a34c180a91f94c1f4"},
-{file = "grpcio-1.40.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:8a35b5f87247c893b01abf2f4f7493a18c2c5bf8eb3923b8dd1654d8377aa1a7"},
+{file = "grpcio-1.41.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:9674a9d3f23702e35a89e22504f41b467893cf704f627cc9cdd118cf1dcc8e26"},
-{file = "grpcio-1.40.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:45704b9b5b85f9bcb027f90f2563d11d995c1b870a9ee4b3766f6c7ff6fc3505"},
+{file = "grpcio-1.41.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:c95dd6e60e059ff770a2ac9f5a202b75dd64d76b0cd0c48f27d58907e43ed6a6"},
-{file = "grpcio-1.40.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:4967949071c9e435f9565ec2f49700cebeda54836a04710fe21f7be028c0125a"},
+{file = "grpcio-1.41.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:a3cd7f945d3e3b82ebd2a4c9862eb9891a5ac87f84a7db336acbeafd86e6c402"},
-{file = "grpcio-1.40.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:1f9ccc9f5c0d5084d1cd917a0b5ff0142a8d269d0755592d751f8ce9e7d3d7f1"},
+{file = "grpcio-1.41.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:c07acd49541f5f6f9984fe0adf162d77bf70e0f58e77f9960c6f571314ff63a4"},
-{file = "grpcio-1.40.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:5729ca9540049f52c2e608ca110048cfabab3aeaa0d9f425361d9f8ba8506cac"},
+{file = "grpcio-1.41.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:7da3f6f6b857399c9ad85bcbffc83189e547a0a1a777ab68f5385154f8bc1ed4"},
-{file = "grpcio-1.40.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:edddc849bed3c5dfe215a9f9532a9bd9f670b57d7b8af603be80148b4c69e9a8"},
+{file = "grpcio-1.41.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39ce785f0cbd07966a9019386b7a054615b2da63da3c7727f371304d000a1890"},
-{file = "grpcio-1.40.0-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:49155dfdf725c0862c428039123066b25ce61bd38ce50a21ce325f1735aac1bd"},
+{file = "grpcio-1.41.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07594e585a5ba25cf331ddb63095ca51010c34e328a822cb772ffbd5daa62cb5"},
-{file = "grpcio-1.40.0-cp37-cp37m-win32.whl", hash = "sha256:913916823efa2e487b2ee9735b7759801d97fd1974bacdb1900e3bbd17f7d508"},
+{file = "grpcio-1.41.0-cp37-cp37m-win32.whl", hash = "sha256:3bbeee115b05b22f6a9fa9bc78f9ab8d9d6bb8c16fdfc60401fc8658beae1099"},
-{file = "grpcio-1.40.0-cp37-cp37m-win_amd64.whl", hash = "sha256:24277aab99c346ca36a1aa8589a0624e19a8e6f2b74c83f538f7bb1cc5ee8dbc"},
+{file = "grpcio-1.41.0-cp37-cp37m-win_amd64.whl", hash = "sha256:dcb5f324712a104aca4a459e524e535f205f36deb8005feb4f9d3ff0a22b5177"},
-{file = "grpcio-1.40.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:a66a30513d2e080790244a7ac3d7a3f45001f936c5c2c9613e41e2a5d7a11794"},
+{file = "grpcio-1.41.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:83c1e731c2b76f26689ad88534cafefe105dcf385567bead08f5857cb308246b"},
-{file = "grpcio-1.40.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:e2367f2b18dd4ba64cdcd9f626a920f9ec2e8228630839dc8f4a424d461137ea"},
+{file = "grpcio-1.41.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:5d4b30d068b022e412adcf9b14c0d9bcbc872e9745b91467edc0a4c700a8bba6"},
-{file = "grpcio-1.40.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:27dee6dcd1c04c4e9ceea49f6143003569292209d2c24ca100166660805e2440"},
+{file = "grpcio-1.41.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d71aa430b2ac40e18e388504ac34cc91d49d811855ca507c463a21059bf364f0"},
-{file = "grpcio-1.40.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:d271e52038dec0db7c39ad9303442d6087c55e09b900e2931b86e837cf0cbc2e"},
+{file = "grpcio-1.41.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:c8c5bc498f6506b6041c30afb7a55c57a9fd535d1a0ac7cdba9b5fd791a85633"},
-{file = "grpcio-1.40.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:41e250ec7cd7523bf49c815b5509d5821728c26fac33681d4b0d1f5f34f59f06"},
+{file = "grpcio-1.41.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:a144f6cecbb61aace12e5920840338a3d246123a41d795e316e2792e9775ad15"},
-{file = "grpcio-1.40.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:33dc4259fecb96e6eac20f760656b911bcb1616aa3e58b3a1d2f125714a2f5d3"},
+{file = "grpcio-1.41.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e516124010ef60d5fc2e0de0f1f987599249dc55fd529001f17f776a4145767f"},
-{file = "grpcio-1.40.0-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:72b7b8075ee822dad4b39c150d73674c1398503d389e38981e9e35a894c476de"},
+{file = "grpcio-1.41.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1e0a4c86d4cbd93059d5eeceed6e1c2e3e1494e1bf40be9b8ab14302c576162"},
-{file = "grpcio-1.40.0-cp38-cp38-win32.whl", hash = "sha256:a93490e6eff5fce3748fb2757cb4273dc21eb1b56732b8c9640fd82c1997b215"},
+{file = "grpcio-1.41.0-cp38-cp38-win32.whl", hash = "sha256:a614224719579044bd7950554d3b4c1793bb5715cbf0f0399b1f21d283c40ef6"},
-{file = "grpcio-1.40.0-cp38-cp38-win_amd64.whl", hash = "sha256:d3b4b41eb0148fca3e6e6fc61d1332a7e8e7c4074fb0d1543f0b255d7f5f1588"},
+{file = "grpcio-1.41.0-cp38-cp38-win_amd64.whl", hash = "sha256:b2de4e7b5a930be04a4d05c9f5fce7e9191217ccdc174b026c2a7928770dca9f"},
-{file = "grpcio-1.40.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:fbe3b66bfa2c2f94535f6063f6db62b5b150d55a120f2f9e1175d3087429c4d9"},
+{file = "grpcio-1.41.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:056806e83eaa09d0af0e452dd353db8f7c90aa2dedcce1112a2d21592550f6b1"},
-{file = "grpcio-1.40.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:ecfd80e8ea03c46b3ea7ed37d2040fcbfe739004b9e4329b8b602d06ac6fb113"},
+{file = "grpcio-1.41.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:5502832b7cec670a880764f51a335a19b10ff5ab2e940e1ded67f39b88aa02b1"},
-{file = "grpcio-1.40.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:d487b4daf84a14741ca1dc1c061ffb11df49d13702cd169b5837fafb5e84d9c0"},
+{file = "grpcio-1.41.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:585847ed190ea9cb4d632eb0ebf58f1d299bbca5e03284bc3d0fa08bab6ea365"},
-{file = "grpcio-1.40.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c26de909cfd54bacdb7e68532a1591a128486af47ee3a5f828df9aa2165ae457"},
+{file = "grpcio-1.41.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:d0cc0393744ce3ce1b237ae773635cc928470ff46fb0d3f677e337a38e5ed4f6"},
-{file = "grpcio-1.40.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:1d9eabe2eb2f78208f9ae67a591f73b024488449d4e0a5b27c7fca2d6901a2d4"},
+{file = "grpcio-1.41.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:2882b62f74de8c8a4f7b2be066f6230ecc46f4edc8f42db1fb7358200abe3b25"},
-{file = "grpcio-1.40.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:4c2baa438f51152c9b7d0835ff711add0b4bc5056c0f5df581a6112153010696"},
+{file = "grpcio-1.41.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:297ee755d3c6cd7e7d3770f298f4d4d4b000665943ae6d2888f7407418a9a510"},
-{file = "grpcio-1.40.0-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:bf114be0023b145f7101f392a344692c1efd6de38a610c54a65ed3cba035e669"},
+{file = "grpcio-1.41.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace080a9c3c673c42adfd2116875a63fec9613797be01a6105acf7721ed0c693"},
-{file = "grpcio-1.40.0-cp39-cp39-win32.whl", hash = "sha256:5f6d6b638698fa6decf7f040819aade677b583eaa21b43366232cb254a2bbac8"},
+{file = "grpcio-1.41.0-cp39-cp39-win32.whl", hash = "sha256:1bcbeac764bbae329bc2cc9e95d0f4d3b0fb456b92cf12e7e06e3e860a4b31cf"},
-{file = "grpcio-1.40.0-cp39-cp39-win_amd64.whl", hash = "sha256:005fe14e67291498989da67d454d805be31d57a988af28ed3a2a0a7cabb05c53"},
+{file = "grpcio-1.41.0-cp39-cp39-win_amd64.whl", hash = "sha256:4537bb9e35af62c5189493792a8c34d127275a6d175c8ad48b6314cacba4021e"},
-{file = "grpcio-1.40.0.tar.gz", hash = "sha256:3d172158fe886a2604db1b6e17c2de2ab465fe0fe36aba2ec810ca8441cefe3a"},
+{file = "grpcio-1.41.0.tar.gz", hash = "sha256:15c04d695833c739dbb25c88eaf6abd9a461ec0dbd32f44bc8769335a495cf5a"},
 ]
 grpcio-gcp = [
 {file = "grpcio-gcp-0.2.2.tar.gz", hash = "sha256:e292605effc7da39b7a8734c719afb12ec4b5362add3528d8afad3aa3aa9057c"},
@@ -2153,8 +2154,8 @@ pillow = [
 {file = "Pillow-8.3.2.tar.gz", hash = "sha256:dde3f3ed8d00c72631bc19cbfff8ad3b6215062a5eed402381ad365f82f0c18c"},
 ]
 platformdirs = [
-{file = "platformdirs-2.3.0-py3-none-any.whl", hash = "sha256:8003ac87717ae2c7ee1ea5a84a1a61e87f3fbd16eb5aadba194ea30a9019f648"},
+{file = "platformdirs-2.4.0-py3-none-any.whl", hash = "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"},
-{file = "platformdirs-2.3.0.tar.gz", hash = "sha256:15b056538719b1c94bdaccb29e5f81879c7f7f0f4a153f46086d155dffcd4f0f"},
+{file = "platformdirs-2.4.0.tar.gz", hash = "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2"},
 ]
 prospector = [
 {file = "prospector-1.5.1-py3-none-any.whl", hash = "sha256:47f8ff3fd36ae276967eb392ca20b300a7bdea66c0d0252250a4d89a6c03ab15"},

@@ -20,6 +20,8 @@ pylint:
 - super-init-not-called
 - arguments-differ
 - inconsistent-return-statements
+- expression-not-assigned
+- line-too-long
 enable:
 
 options:

@@ -7,14 +7,17 @@ authors = ["Daniel Tomlinson <dtomlinson@panaetius.co.uk>"]
 [tool.poetry.dependencies]
 python = "^3.7"
 apache-beam = {extras = ["gcp"], version = "^2.32.0"}
-mkdocs = "^1.2.2"
-mkdocs-material = "^7.3.0"
 
 [tool.poetry.dev-dependencies]
 # pytest = "^5.2"
 prospector = "^1.5.1"
 pandas-profiling = "^3.0.0"
+mkdocs = "^1.2.2"
+mkdocs-material = "^7.3.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+"analyse-properties" = "analyse_properties.main:run"
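
With the `[tool.poetry.scripts]` entry added above, the pipeline entry point can presumably also be run through Poetry; a guess at the equivalent of the local invocation documented earlier:

```bash
# Hypothetical: run the new console script via Poetry instead of python -m.
poetry install
poetry run analyse-properties --runner DirectRunner
```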

@@ -1,6 +0,0 @@
-apache-beam==2.32.0; python_version >= "3.6"
-avro-python3==1.9.2.1; python_version >= "3.6"
-cachetools==4.2.2; python_version >= "3.6" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0")
-certifi==2021.5.30; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
-mkdocs-material==7.3.0
-mkdocs==1.2.2; python_version >= "3.6"
53 requirements.txt Normal file
@@ -0,0 +1,53 @@
apache-beam==2.32.0; python_version >= "3.6"
avro-python3==1.9.2.1; python_version >= "3.6"
cachetools==4.2.2; python_version >= "3.6" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0")
certifi==2021.5.30; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
charset-normalizer==2.0.6; python_full_version >= "3.6.0" and python_version >= "3.6"
crcmod==1.7; python_version >= "3.6"
dill==0.3.1.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.1.0" and python_version >= "3.6"
docopt==0.6.2; python_version >= "3.6"
fastavro==1.4.5; python_version >= "3.6"
fasteners==0.16.3; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.5.0"
future==0.18.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.6"
google-api-core==1.31.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
google-apitools==0.5.31; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.5.0"
google-auth==1.35.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
google-cloud-bigquery==2.6.1; python_version >= "3.6"
google-cloud-bigtable==1.7.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-cloud-core==1.7.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
google-cloud-datastore==1.15.3; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-cloud-dlp==1.0.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-cloud-language==1.3.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-cloud-pubsub==1.7.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.5.0"
google-cloud-recommendations-ai==0.2.0; python_version >= "3.6"
google-cloud-spanner==1.19.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-cloud-videointelligence==1.16.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-cloud-vision==1.0.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0"
google-crc32c==1.2.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
google-resumable-media==1.3.3; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
googleapis-common-protos==1.53.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
grpc-google-iam-v1==0.12.3; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.5.0"
grpcio-gcp==0.2.2; python_version >= "3.6"
grpcio==1.41.0; python_version >= "3.6" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.4.0")
hdfs==2.6.0; python_version >= "3.6"
httplib2==0.19.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.5.0"
idna==3.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
numpy==1.20.3; python_version >= "3.7"
oauth2client==4.1.3; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.5.0"
orjson==3.6.3; python_version >= "3.7"
packaging==21.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
proto-plus==1.19.0; python_version >= "3.6"
protobuf==3.18.0; python_version >= "3.6" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0")
pyarrow==4.0.1; python_version >= "3.6"
pyasn1-modules==0.2.8; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
pyasn1==0.4.8; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "4" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0") or python_version >= "3.6" and python_full_version >= "3.6.0" and python_version < "4" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0")
pydot==1.4.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
pymongo==3.12.0; python_version >= "3.6"
pyparsing==2.4.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
python-dateutil==2.8.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.6"
pytz==2021.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
requests==2.26.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
rsa==4.7.2; python_version >= "3.6" and python_version < "4" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0")
six==1.16.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_version >= "3.6" and python_full_version >= "3.6.0"
typing-extensions==3.7.4.3; python_version >= "3.6"
urllib3==1.26.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.6"
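
The environment markers and exact pins look like output from Poetry's export command; a guess at how this file is regenerated from `pyproject.toml`/`poetry.lock` (the exact command is not recorded in the repo):

```bash
# Assumed regeneration command - not recorded in the repository itself.
poetry export -f requirements.txt --output requirements.txt --without-hashes
```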