mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 11:55:45 +00:00
Merge final release (#1)
* adding initial skeleton * updating .gitignore * updating dev dependencies * adding report.py * updating notes * adding prospector.yaml * updating beam to install gcp extras * adding documentation * adding data exploration report + code * adding latest beam pipeline code * adding latest beam pipeline code * adding debug.py * adding latesty beam pipeline code * adding latest beam pipeline code * adding latest beam pipeline code * updating .gitignore * updating folder structure for data input/output * updating prospector.yaml * adding latest beam pipeline code * updating prospector.yaml * migrate beam pipeline to main.py * updating .gitignore * updating .gitignore * adding download script for data set * adding initial docs * moving inputs/outputs to use pathlib * removing shard_name_template from output file * adding pyenv 3.7.9 * removing requirements.txt for documentation * updating README.md * updating download data script for new location in GCS * adding latest beam pipeline code for dataflow * adding latest beam pipeline code for dataflow * adding latest beam pipeline code for dataflow * moving dataflow notes * updating prospector.yaml * adding latest beam pipeline code for dataflow * updating beam pipeline to use GroupByKey * updating download_data script with new bucket * update prospector.yaml * update dataflow documentation with new commands for vpc * adding latest beam pipeline code for dataflow with group optimisation * updating dataflow documentation * adding latest beam pipeline code for dataflow with group optimisation * updating download_data script with pp-2020 dataset * adding temporary notes * updating dataflow notes * adding latest beam pipeline code * updating dataflow notes * adding latest beam pipeline code for dataflow * adding debug print * moving panda-profiling report into docs * updating report.py * adding entrypoint command * adding initial docs * adding commands.md to notes * commenting out debug imports * updating documentation * updating latest beam pipeline with default inputs * updating poetry * adding requirements.txt * updating documentation
This commit is contained in:
27
notes/tmp/errordata
Normal file
27
notes/tmp/errordata
Normal file
@@ -0,0 +1,27 @@
|
||||
"Error message from worker: Traceback (most recent call last):
|
||||
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 651, in do_work
|
||||
work_executor.execute()
|
||||
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 181, in execute
|
||||
op.finish()
|
||||
File "dataflow_worker/native_operations.py", line 93, in dataflow_worker.native_operations.NativeWriteOperation.finish
|
||||
File "dataflow_worker/native_operations.py", line 94, in dataflow_worker.native_operations.NativeWriteOperation.finish
|
||||
File "dataflow_worker/native_operations.py", line 95, in dataflow_worker.native_operations.NativeWriteOperation.finish
|
||||
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/nativeavroio.py", line 308, in __exit__
|
||||
self._data_file_writer.flush()
|
||||
File "fastavro/_write.pyx", line 664, in fastavro._write.Writer.flush
|
||||
File "fastavro/_write.pyx", line 639, in fastavro._write.Writer.dump
|
||||
File "fastavro/_write.pyx", line 451, in fastavro._write.snappy_write_block
|
||||
File "fastavro/_write.pyx", line 458, in fastavro._write.snappy_write_block
|
||||
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/filesystemio.py", line 200, in write
|
||||
self._uploader.put(b)
|
||||
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/gcp/gcsio.py", line 720, in put
|
||||
self._conn.send_bytes(data.tobytes())
|
||||
File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
|
||||
self._send_bytes(m[offset:offset + size])
|
||||
File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 393, in _send_bytes
|
||||
header = struct.pack("!i", n)
|
||||
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
|
||||
"
|
||||
|
||||
|
||||
"Out of memory: Killed process 2042 (python) total-vm:28616496kB, anon-rss:25684136kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:51284kB oom_score_adj:900"
|
||||
44
notes/tmp/exampledata
Normal file
44
notes/tmp/exampledata
Normal file
@@ -0,0 +1,44 @@
|
||||
[{
|
||||
"property_id": "3cf3c06632c46754696f2017933702f3",
|
||||
"flat_appartment": "",
|
||||
"builing": "",
|
||||
"number": "63",
|
||||
"street": "ROTTON PARK STREET",
|
||||
"locality": "",
|
||||
"town": "BIRMINGHAM",
|
||||
"district": "BIRMINGHAM",
|
||||
"county": "WEST MIDLANDS",
|
||||
"postcode": "B16 0AE",
|
||||
"property_transactions": [
|
||||
{ "price": "385000", "transaction_date": "2021-01-08", "year": "2021" },
|
||||
{ "price": "701985", "transaction_date": "2019-03-28", "year": "2019" },
|
||||
{ "price": "1748761", "transaction_date": "2020-05-27", "year": "2020" }
|
||||
],
|
||||
"latest_transaction_year": "2021"
|
||||
},
|
||||
{
|
||||
"property_id": "c650d5d7bb0daf0a19bb2cacabbee74e",
|
||||
"readable_address": "16 STATION ROAD\nPARKGATE\nNESTON\nCHESHIRE WEST AND CHESTER\nCH64 6QJ",
|
||||
"flat_appartment": "",
|
||||
"builing": "",
|
||||
"number": "16",
|
||||
"street": "STATION ROAD",
|
||||
"locality": "PARKGATE",
|
||||
"town": "NESTON",
|
||||
"district": "CHESHIRE WEST AND CHESTER",
|
||||
"county": "CHESHIRE WEST AND CHESTER",
|
||||
"postcode": "CH64 6QJ",
|
||||
"property_transactions": [
|
||||
{
|
||||
"price": "280000",
|
||||
"transaction_date": "2020-11-30",
|
||||
"year": "2020"
|
||||
},
|
||||
{
|
||||
"price": "265000",
|
||||
"transaction_date": "2020-05-29",
|
||||
"year": "2020"
|
||||
}
|
||||
],
|
||||
"latest_transaction_year": "2020"
|
||||
}]
|
||||
16
notes/tmp/runningdata
Normal file
16
notes/tmp/runningdata
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
|
||||
Create Mapping table
|
||||
('fd4634faec47c29de40bbf7840723b41', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])
|
||||
('fd4634faec47c29de40bbf7840723b41', ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', ''])
|
||||
|
||||
Condensing
|
||||
{'fd4634faec47c29de40bbf7840723b41': ['317500', '2020-11-13 00:00', 'B90 3LA', '1', '', 'VERSTONE ROAD', 'SHIRLEY', 'SOLIHULL', 'SOLIHULL', 'WEST MIDLANDS', '']}
|
||||
|
||||
|
||||
Prepared
|
||||
GroupByKey
|
||||
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['fd4634faec47c29de40bbf7840723b41', 'fd4634faec47c29de40bbf7840723b41'])
|
||||
|
||||
deduplicated
|
||||
('fe205bfe66bc7f18c50c8f3d77ec3e30', ['fd4634faec47c29de40bbf7840723b41'])
|
||||
Reference in New Issue
Block a user