mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 03:55:43 +00:00
* adding initial skeleton * updating .gitignore * updating dev dependencies * adding report.py * updating notes * adding prospector.yaml * updating beam to install gcp extras * adding documentation * adding data exploration report + code * adding latest beam pipeline code * adding latest beam pipeline code * adding debug.py * adding latesty beam pipeline code * adding latest beam pipeline code * adding latest beam pipeline code * updating .gitignore * updating folder structure for data input/output * updating prospector.yaml * adding latest beam pipeline code * updating prospector.yaml * migrate beam pipeline to main.py * updating .gitignore * updating .gitignore * adding download script for data set * adding initial docs * moving inputs/outputs to use pathlib * removing shard_name_template from output file * adding pyenv 3.7.9 * removing requirements.txt for documentation * updating README.md * updating download data script for new location in GCS * adding latest beam pipeline code for dataflow * adding latest beam pipeline code for dataflow * adding latest beam pipeline code for dataflow * moving dataflow notes * updating prospector.yaml * adding latest beam pipeline code for dataflow * updating beam pipeline to use GroupByKey * updating download_data script with new bucket * update prospector.yaml * update dataflow documentation with new commands for vpc * adding latest beam pipeline code for dataflow with group optimisation * updating dataflow documentation * adding latest beam pipeline code for dataflow with group optimisation * updating download_data script with pp-2020 dataset * adding temporary notes * updating dataflow notes * adding latest beam pipeline code * updating dataflow notes * adding latest beam pipeline code for dataflow * adding debug print * moving panda-profiling report into docs * updating report.py * adding entrypoint command * adding initial docs * adding commands.md to notes * commenting out debug imports * updating documentation * 
updating latest beam pipeline with default inputs * updating poetry * adding requirements.txt * updating documentation
39 lines
959 B
Python
39 lines
959 B
Python
import pathlib
|
|
|
|
import pandas as pd
|
|
from pandas_profiling import ProfileReport
|
|
|
|
|
|
def main(input_file=None, output_file="price_paid_data_report.html") -> None:
    """Profile the Price Paid Data CSV and write the report as HTML.

    The CSV is read without a header row: ``names`` is supplied and
    ``header`` is not, so pandas treats the first line as data and labels
    the sixteen columns with the names listed below.

    Args:
        input_file: Path of the CSV to profile. When omitted, defaults to
            ``data/input/pp-complete.csv`` under the directory one level
            above the directory containing this file.
        output_file: Where the HTML report is written. The default is a
            bare file name, so it resolves against the current working
            directory.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    if input_file is None:
        input_file = (
            pathlib.Path(__file__).parents[1] / "data" / "input" / "pp-complete.csv"
        )

    column_names = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category",
        "record_status",
    ]

    # Give pandas the path rather than an already-open text handle: a bare
    # ``open()`` decodes with the locale's preferred encoding, so the same
    # file could parse differently from one machine to the next. pandas
    # opens the file itself and decodes it as UTF-8 by default.
    # NOTE(review): assumes the CSV is UTF-8 encoded - confirm.
    df_report = pd.read_csv(input_file, names=column_names)

    # minimal=True asks the profiler for its reduced configuration.
    profile = ProfileReport(df_report, title="Price Paid Data", minimal=True)
    profile.to_file(output_file)
|
|
|
|
|
|
# Build the report only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|