mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 11:55:45 +00:00
Deployed a53d791 with MkDocs version: 1.2.2
This commit is contained in:
120
404.html
120
404.html
@@ -200,6 +200,22 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="/dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-tabs__item">
|
<li class="md-tabs__item">
|
||||||
<a href="/pandas-profiling/report.html" class="md-tabs__link">
|
<a href="/pandas-profiling/report.html" class="md-tabs__link">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
@@ -384,6 +400,110 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/discussion/cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/discussion/approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/discussion/results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="/dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
</li>
|
</li>
|
||||||
|
|||||||
@@ -0,0 +1,3 @@
|
|||||||
|
[ZoneTransfer]
|
||||||
|
ZoneId=3
|
||||||
|
HostUrl=about:internet
|
||||||
BIN
dataflow/img/successful_dataflow_job.png
Normal file
BIN
dataflow/img/successful_dataflow_job.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 374 KiB |
803
dataflow/index.html
Normal file
803
dataflow/index.html
Normal file
@@ -0,0 +1,803 @@
|
|||||||
|
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="en" class="no-js">
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="icon" href="../assets/images/favicon.png">
|
||||||
|
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<title>Running on DataFlow - The Street Group Technical Test</title>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<meta name="theme-color" content="#4051b5">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||||
|
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||||
|
|
||||||
|
|
||||||
|
<script>function __prefix(e){return new URL("..",location).pathname+"."+e}function __get(e,t=localStorage){return JSON.parse(t.getItem(__prefix(e)))}</script>
|
||||||
|
|
||||||
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||||
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||||
|
<label class="md-overlay" for="__drawer"></label>
|
||||||
|
<div data-md-component="skip">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="#running-on-dataflow" class="md-skip">
|
||||||
|
Skip to content
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-md-component="announce">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<header class="md-header" data-md-component="header">
|
||||||
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
<label class="md-header__button md-icon" for="__drawer">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||||
|
</label>
|
||||||
|
<div class="md-header__title" data-md-component="header-title">
|
||||||
|
<div class="md-header__ellipsis">
|
||||||
|
<div class="md-header__topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
The Street Group Technical Test
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="md-header__topic" data-md-component="header-topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Running on DataFlow
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-header__button md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-search" data-md-component="search" role="dialog">
|
||||||
|
<label class="md-search__overlay" for="__search"></label>
|
||||||
|
<div class="md-search__inner" role="search">
|
||||||
|
<form class="md-search__form" name="search">
|
||||||
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||||
|
<label class="md-search__icon md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</label>
|
||||||
|
<nav class="md-search__options" aria-label="Search">
|
||||||
|
|
||||||
|
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||||
|
</button>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</form>
|
||||||
|
<div class="md-search__output">
|
||||||
|
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||||
|
<div class="md-search-result" data-md-component="search-result">
|
||||||
|
<div class="md-search-result__meta">
|
||||||
|
Initializing search
|
||||||
|
</div>
|
||||||
|
<ol class="md-search-result__list"></ol>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-header__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="md-container" data-md-component="container">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||||
|
<div class="md-tabs__inner md-grid">
|
||||||
|
<ul class="md-tabs__list">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../index.html" class="md-tabs__link">
|
||||||
|
Documentation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../discussion/introduction.html" class="md-tabs__link">
|
||||||
|
Discussion
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="index.html" class="md-tabs__link md-tabs__link--active">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<main class="md-main" data-md-component="main">
|
||||||
|
<div class="md-main__inner md-grid">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||||
|
<label class="md-nav__title" for="__drawer">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
The Street Group Technical Test
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-nav__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_1">
|
||||||
|
Documentation
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_1">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Documentation
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../index.html" class="md-nav__link">
|
||||||
|
Welcome
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/installation.html" class="md-nav__link">
|
||||||
|
Installation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/usage.html" class="md-nav__link">
|
||||||
|
Usage
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_2">
|
||||||
|
Discussion
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_2">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Discussion
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/introduction.html" class="md-nav__link">
|
||||||
|
Introduction
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/exploration.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" checked>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active">
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
Running on DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<a href="index.html" class="md-nav__link md-nav__link--active">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#prerequisites" class="md-nav__link">
|
||||||
|
Prerequisites
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Prerequisites">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#cloud-storage" class="md-nav__link">
|
||||||
|
Cloud Storage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#vpc" class="md-nav__link">
|
||||||
|
VPC
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#command" class="md-nav__link">
|
||||||
|
Command
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#prerequisites" class="md-nav__link">
|
||||||
|
Prerequisites
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Prerequisites">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#cloud-storage" class="md-nav__link">
|
||||||
|
Cloud Storage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#vpc" class="md-nav__link">
|
||||||
|
VPC
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#command" class="md-nav__link">
|
||||||
|
Command
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-content" data-md-component="content">
|
||||||
|
<article class="md-content__inner md-typeset">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/dataflow/index.md" title="Edit this page" class="md-content__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<h1 id="running-on-dataflow">Running on DataFlow<a class="headerlink" href="#running-on-dataflow" title="Permanent link">¶</a></h1>
|
||||||
|
<p>The pipeline runs as-is on GCP DataFlow. The following documents how I deployed it to my personal GCP account, but the approach may vary depending on your GCP project/account.</p>
|
||||||
|
<h2 id="prerequisites">Prerequisites<a class="headerlink" href="#prerequisites" title="Permanent link">¶</a></h2>
|
||||||
|
<h3 id="cloud-storage">Cloud Storage<a class="headerlink" href="#cloud-storage" title="Permanent link">¶</a></h3>
|
||||||
|
<ul>
|
||||||
|
<li>A Cloud Storage bucket with the following structure:</li>
|
||||||
|
</ul>
|
||||||
|
<div class="highlight"><pre><span></span><code>./input
|
||||||
|
./output
|
||||||
|
./tmp
|
||||||
|
</code></pre></div>
|
||||||
|
<ul>
|
||||||
|
<li>Place the input files into the <code>./input</code> directory in the bucket.</li>
|
||||||
|
</ul>
|
||||||
|
<h3 id="vpc">VPC<a class="headerlink" href="#vpc" title="Permanent link">¶</a></h3>
|
||||||
|
<p>To get around public IP quotas, I created a VPC in the <code>europe-west1</code> region that has <code>Private Google Access</code> set to <code>ON</code>.</p>
|
||||||
|
<h2 id="command">Command<a class="headerlink" href="#command" title="Permanent link">¶</a></h2>
|
||||||
|
<div class="admonition tip">
|
||||||
|
<p class="admonition-title">Tip</p>
|
||||||
|
<p>We need to choose a <code>worker_machine_type</code> with sufficient memory to run the pipeline. As the pipeline uses a mapping table, and DataFlow autoscales on CPU rather than memory usage, we need a machine with more RAM than usual to ensure sufficient memory when running on one worker. For <code>pp-2020.csv</code>, the type <code>n1-highmem-2</code> with 2 vCPUs and 13 GB of RAM was chosen, and the job completed successfully in ~10 minutes using only 1 worker.</p>
|
||||||
|
</div>
|
||||||
|
<p>Assuming the <code>pp-2020.csv</code> file has been placed in the <code>./input</code> directory in the bucket you can run a command similar to:</p>
|
||||||
|
<div class="admonition caution">
|
||||||
|
<p class="admonition-title">Caution</p>
|
||||||
|
<p>Use the command <code>python -m analyse_properties.main</code> as the entrypoint to the pipeline and not <code>analyse-properties</code>, as the module isn't installed with Poetry on the workers with the configuration below.</p>
|
||||||
|
</div>
|
||||||
|
<div class="highlight"><pre><span></span><code>python -m analyse_properties.main <span class="se">\</span>
|
||||||
|
--runner DataflowRunner <span class="se">\</span>
|
||||||
|
--project street-group <span class="se">\</span>
|
||||||
|
--region europe-west1 <span class="se">\</span>
|
||||||
|
--input gs://street-group-technical-test-dmot-euw1/input/pp-2020.csv <span class="se">\</span>
|
||||||
|
--output gs://street-group-technical-test-dmot-euw1/output/pp-2020 <span class="se">\</span>
|
||||||
|
--temp_location gs://street-group-technical-test-dmot-euw1/tmp <span class="se">\</span>
|
||||||
|
--subnetwork<span class="o">=</span>https://www.googleapis.com/compute/v1/projects/street-group/regions/europe-west1/subnetworks/europe-west-1-dataflow <span class="se">\</span>
|
||||||
|
--no_use_public_ips <span class="se">\</span>
|
||||||
|
--worker_machine_type<span class="o">=</span>n1-highmem-2
|
||||||
|
</code></pre></div>
|
||||||
|
<p>The output file from this pipeline is publicly available and can be downloaded <a href="https://storage.googleapis.com/street-group-technical-test-dmot-euw1/output/pp-2020-00000-of-00001.json">here</a>.</p>
|
||||||
|
<p>The job graph for this pipeline is displayed below:</p>
|
||||||
|
<p><img alt="JobGraph" src="img/successful_dataflow_job.png" /></p>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
|
||||||
|
<footer class="md-footer">
|
||||||
|
|
||||||
|
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="../discussion/results.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Results" rel="prev">
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Previous
|
||||||
|
</span>
|
||||||
|
Results
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<a href="scaling.html" class="md-footer__link md-footer__link--next" aria-label="Next: Scaling to the Full DataSet" rel="next">
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Next
|
||||||
|
</span>
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
<div class="md-footer-meta md-typeset">
|
||||||
|
<div class="md-footer-meta__inner md-grid">
|
||||||
|
<div class="md-footer-copyright">
|
||||||
|
|
||||||
|
|
||||||
|
Made with
|
||||||
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||||
|
Material for MkDocs
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div class="md-dialog" data-md-component="dialog">
|
||||||
|
<div class="md-dialog__inner md-typeset"></div>
|
||||||
|
</div>
|
||||||
|
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||||
|
|
||||||
|
|
||||||
|
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||||
|
|
||||||
|
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
780
dataflow/scaling.html
Normal file
780
dataflow/scaling.html
Normal file
@@ -0,0 +1,780 @@
|
|||||||
|
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="en" class="no-js">
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="icon" href="../assets/images/favicon.png">
|
||||||
|
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<title>Scaling to the Full DataSet - The Street Group Technical Test</title>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<meta name="theme-color" content="#4051b5">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||||
|
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||||
|
|
||||||
|
|
||||||
|
<script>function __prefix(e){return new URL("..",location).pathname+"."+e}function __get(e,t=localStorage){return JSON.parse(t.getItem(__prefix(e)))}</script>
|
||||||
|
|
||||||
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||||
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||||
|
<label class="md-overlay" for="__drawer"></label>
|
||||||
|
<div data-md-component="skip">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="#scaling-to-the-full-dataset" class="md-skip">
|
||||||
|
Skip to content
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-md-component="announce">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<header class="md-header" data-md-component="header">
|
||||||
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
<label class="md-header__button md-icon" for="__drawer">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||||
|
</label>
|
||||||
|
<div class="md-header__title" data-md-component="header-title">
|
||||||
|
<div class="md-header__ellipsis">
|
||||||
|
<div class="md-header__topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
The Street Group Technical Test
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="md-header__topic" data-md-component="header-topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-header__button md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-search" data-md-component="search" role="dialog">
|
||||||
|
<label class="md-search__overlay" for="__search"></label>
|
||||||
|
<div class="md-search__inner" role="search">
|
||||||
|
<form class="md-search__form" name="search">
|
||||||
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||||
|
<label class="md-search__icon md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</label>
|
||||||
|
<nav class="md-search__options" aria-label="Search">
|
||||||
|
|
||||||
|
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||||
|
</button>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</form>
|
||||||
|
<div class="md-search__output">
|
||||||
|
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||||
|
<div class="md-search-result" data-md-component="search-result">
|
||||||
|
<div class="md-search-result__meta">
|
||||||
|
Initializing search
|
||||||
|
</div>
|
||||||
|
<ol class="md-search-result__list"></ol>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-header__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="md-container" data-md-component="container">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||||
|
<div class="md-tabs__inner md-grid">
|
||||||
|
<ul class="md-tabs__list">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../index.html" class="md-tabs__link">
|
||||||
|
Documentation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../discussion/introduction.html" class="md-tabs__link">
|
||||||
|
Discussion
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="index.html" class="md-tabs__link md-tabs__link--active">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<main class="md-main" data-md-component="main">
|
||||||
|
<div class="md-main__inner md-grid">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||||
|
<label class="md-nav__title" for="__drawer">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
The Street Group Technical Test
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-nav__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_1">
|
||||||
|
Documentation
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_1">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Documentation
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../index.html" class="md-nav__link">
|
||||||
|
Welcome
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/installation.html" class="md-nav__link">
|
||||||
|
Installation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/usage.html" class="md-nav__link">
|
||||||
|
Usage
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_2">
|
||||||
|
Discussion
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_2">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Discussion
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/introduction.html" class="md-nav__link">
|
||||||
|
Introduction
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/exploration.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" checked>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active">
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<a href="scaling.html" class="md-nav__link md-nav__link--active">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#mapping-table" class="md-nav__link">
|
||||||
|
Mapping table
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#patterns" class="md-nav__link">
|
||||||
|
Patterns
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#solution" class="md-nav__link">
|
||||||
|
Solution
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#mapping-table" class="md-nav__link">
|
||||||
|
Mapping table
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#patterns" class="md-nav__link">
|
||||||
|
Patterns
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#solution" class="md-nav__link">
|
||||||
|
Solution
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-content" data-md-component="content">
|
||||||
|
<article class="md-content__inner md-typeset">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/dataflow/scaling.md" title="Edit this page" class="md-content__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<h1 id="scaling-to-the-full-dataset">Scaling to the full DataSet<a class="headerlink" href="#scaling-to-the-full-dataset" title="Permanent link">¶</a></h1>
|
||||||
|
<p>As is, the pipeline will not run against the full dataset. But with a little work done to the existing pipeline, I believe it is possible to run it against the full dataset of ~27 million rows.</p>
|
||||||
|
<h2 id="mapping-table">Mapping table<a class="headerlink" href="#mapping-table" title="Permanent link">¶</a></h2>
|
||||||
|
<p>Using a mapping table as a side-input means that for the full dataset this table is going to be huge.</p>
|
||||||
|
<p>Side inputs are stored in memory on the workers; with such a huge table, the machines are going to quickly run out of available memory when autoscaling is applied.</p>
|
||||||
|
<p>Running the pipeline against the full dataset resulted in the following error:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>"Out of memory: Killed process 2042 (python) total-vm:28616496kB, anon-rss:25684136kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:51284kB oom_score_adj:900"
|
||||||
|
</code></pre></div>
|
||||||
|
<p>with the pipeline job failing to process anything and the rows processed per second gradually falling to zero as the workers killed the Python process to try to free up more memory. This resulted in autoscaling down (as the CPU usage decreased) and the entire pipeline stagnated.</p>
|
||||||
|
<p>Using a higher-tiered <code>worker_machine_type</code>, disabling autoscaling, and fixing the workers to the maximum number of vCPUs available to the quota results in the following pipeline options:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>--worker_machine_type<span class="o">=</span>n1-highmem-8 <span class="se">\</span>
|
||||||
|
--num_workers<span class="o">=</span><span class="m">3</span> <span class="se">\</span>
|
||||||
|
--autoscaling_algorithm<span class="o">=</span>NONE
|
||||||
|
</code></pre></div>
|
||||||
|
<p>with 156GB of RAM available to the pipeline with 52GB on each worker.</p>
|
||||||
|
<p>The pipeline was able to progress further until Python threw an error and the pipeline failed and shut down:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code>"Error message from worker: Traceback (most recent call last):
|
||||||
|
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 651, in do_work
|
||||||
|
work_executor.execute()
|
||||||
|
...
|
||||||
|
File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 393, in _send_bytes
|
||||||
|
header = struct.pack("!i", n)
|
||||||
|
struct.error: 'i' format requires -2147483648 &lt;= number &lt;= 2147483647
|
||||||
|
</code></pre></div>
|
||||||
|
<p>The number 2147483647 is the maximum value of a signed 32-bit integer.</p>
|
||||||
|
<p>As the side-input needs to be pickled (or serialised), this tells us that the table is far too large to be pickled and passed to the other workers. No amount of CPU/Memory can fix the problem.</p>
|
||||||
|
<h2 id="patterns">Patterns<a class="headerlink" href="#patterns" title="Permanent link">¶</a></h2>
|
||||||
|
<p>Google have several patterns for large side-inputs which are documented here:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Part 1 <a href="https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-1">https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-1</a></li>
|
||||||
|
<li>Part 2 <a href="https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-2">https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-2</a></li>
|
||||||
|
</ul>
|
||||||
|
<h2 id="solution">Solution<a class="headerlink" href="#solution" title="Permanent link">¶</a></h2>
|
||||||
|
<p>A possible solution would be to leverage BigQuery to store the results of the mapping table as the pipeline progresses. We can make use of BigQuery's array type to literally store the raw array as we process each row.</p>
|
||||||
|
<p>In addition to creating the mapping table <code>(key, value)</code> pairs, we also save these pairs to BigQuery at this stage. We then yield the element as it is currently written to allow the subsequent stages to make use of this data.</p>
|
||||||
|
<p>Remove the condense mapping table stage as it is no longer needed.</p>
|
||||||
|
<p>Instead of using:</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="n">beam</span><span class="o">.</span><span class="n">FlatMap</span><span class="p">(</span>
|
||||||
|
<span class="n">insert_data_for_id</span><span class="p">,</span> <span class="n">beam</span><span class="o">.</span><span class="n">pvalue</span><span class="o">.</span><span class="n">AsSingleton</span><span class="p">(</span><span class="n">mapping_table_condensed</span><span class="p">)</span>
|
||||||
|
<span class="p">)</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>to insert the results of the mapping table we write a new <code>DoFn</code> that takes the element, and for each <code>id_all_columns</code> in the array we make a call to BigQuery to get the array for this ID and insert it at this stage.</p>
|
||||||
|
<p>Because each <code>id_all_columns</code> and its corresponding data is only used once, there would be no need to cache the results from BigQuery. However, some work could be done to see whether we could pull back more than one row at a time and cache these, saving time and cost in calls to BigQuery.</p>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
|
||||||
|
<footer class="md-footer">
|
||||||
|
|
||||||
|
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="index.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Running on DataFlow" rel="prev">
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Previous
|
||||||
|
</span>
|
||||||
|
Running on DataFlow
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-footer__link md-footer__link--next" aria-label="Next: Data Exploration Report" rel="next">
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Next
|
||||||
|
</span>
|
||||||
|
Data Exploration Report
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
<div class="md-footer-meta md-typeset">
|
||||||
|
<div class="md-footer-meta__inner md-grid">
|
||||||
|
<div class="md-footer-copyright">
|
||||||
|
|
||||||
|
|
||||||
|
Made with
|
||||||
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||||
|
Material for MkDocs
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div class="md-dialog" data-md-component="dialog">
|
||||||
|
<div class="md-dialog__inner md-typeset"></div>
|
||||||
|
</div>
|
||||||
|
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||||
|
|
||||||
|
|
||||||
|
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||||
|
|
||||||
|
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
861
discussion/approach.html
Normal file
861
discussion/approach.html
Normal file
@@ -0,0 +1,861 @@
|
|||||||
|
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="en" class="no-js">
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="icon" href="../assets/images/favicon.png">
|
||||||
|
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<title>Approach - The Street Group Technical Test</title>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<meta name="theme-color" content="#4051b5">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||||
|
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||||
|
|
||||||
|
|
||||||
|
<script>
  // Build a storage key for `key`, namespaced by the pathname one level above the current page.
  function __prefix(key) {
    const parent = new URL("..", location);
    return parent.pathname + "." + key;
  }

  // Look up the namespaced key in `storage` (localStorage by default) and JSON-decode the stored value.
  function __get(key, storage = localStorage) {
    const raw = storage.getItem(__prefix(key));
    return JSON.parse(raw);
  }
</script>
|
||||||
|
|
||||||
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||||
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||||
|
<label class="md-overlay" for="__drawer"></label>
|
||||||
|
<div data-md-component="skip">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="#approach" class="md-skip">
|
||||||
|
Skip to content
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-md-component="announce">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<header class="md-header" data-md-component="header">
|
||||||
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
<label class="md-header__button md-icon" for="__drawer">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||||
|
</label>
|
||||||
|
<div class="md-header__title" data-md-component="header-title">
|
||||||
|
<div class="md-header__ellipsis">
|
||||||
|
<div class="md-header__topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
The Street Group Technical Test
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="md-header__topic" data-md-component="header-topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Approach
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-header__button md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-search" data-md-component="search" role="dialog">
|
||||||
|
<label class="md-search__overlay" for="__search"></label>
|
||||||
|
<div class="md-search__inner" role="search">
|
||||||
|
<form class="md-search__form" name="search">
|
||||||
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||||
|
<label class="md-search__icon md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</label>
|
||||||
|
<nav class="md-search__options" aria-label="Search">
|
||||||
|
|
||||||
|
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||||
|
</button>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</form>
|
||||||
|
<div class="md-search__output">
|
||||||
|
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||||
|
<div class="md-search-result" data-md-component="search-result">
|
||||||
|
<div class="md-search-result__meta">
|
||||||
|
Initializing search
|
||||||
|
</div>
|
||||||
|
<ol class="md-search-result__list"></ol>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-header__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="md-container" data-md-component="container">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||||
|
<div class="md-tabs__inner md-grid">
|
||||||
|
<ul class="md-tabs__list">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../index.html" class="md-tabs__link">
|
||||||
|
Documentation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="introduction.html" class="md-tabs__link md-tabs__link--active">
|
||||||
|
Discussion
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<main class="md-main" data-md-component="main">
|
||||||
|
<div class="md-main__inner md-grid">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||||
|
<label class="md-nav__title" for="__drawer">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
The Street Group Technical Test
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-nav__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_1">
|
||||||
|
Documentation
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_1">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Documentation
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../index.html" class="md-nav__link">
|
||||||
|
Welcome
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/installation.html" class="md-nav__link">
|
||||||
|
Installation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/usage.html" class="md-nav__link">
|
||||||
|
Usage
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" checked>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_2">
|
||||||
|
Discussion
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_2">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Discussion
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="introduction.html" class="md-nav__link">
|
||||||
|
Introduction
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="exploration.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active">
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
Approach
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<a href="approach.html" class="md-nav__link md-nav__link--active">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#loading-stage" class="md-nav__link">
|
||||||
|
Loading stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#cleaning-stage" class="md-nav__link">
|
||||||
|
Cleaning stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#create-a-mapping-table" class="md-nav__link">
|
||||||
|
Create a mapping table
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#prepare-stage" class="md-nav__link">
|
||||||
|
Prepare stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#format-stage" class="md-nav__link">
|
||||||
|
Format stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#save-stage" class="md-nav__link">
|
||||||
|
Save stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#loading-stage" class="md-nav__link">
|
||||||
|
Loading stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#cleaning-stage" class="md-nav__link">
|
||||||
|
Cleaning stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#create-a-mapping-table" class="md-nav__link">
|
||||||
|
Create a mapping table
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#prepare-stage" class="md-nav__link">
|
||||||
|
Prepare stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#format-stage" class="md-nav__link">
|
||||||
|
Format stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#save-stage" class="md-nav__link">
|
||||||
|
Save stage
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-content" data-md-component="content">
|
||||||
|
<article class="md-content__inner md-typeset">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/discussion/approach.md" title="Edit this page" class="md-content__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<h1 id="approach">Approach<a class="headerlink" href="#approach" title="Permanent link">¶</a></h1>
|
||||||
|
<p>The general approach to the pipeline is:</p>
|
||||||
|
<h2 id="loading-stage">Loading stage<a class="headerlink" href="#loading-stage" title="Permanent link">¶</a></h2>
|
||||||
|
<ul>
|
||||||
|
<li>Load using <code class="highlight"><span class="n">beam</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">ReadFromText</span><span class="p">()</span></code></li>
|
||||||
|
<li>Split each loaded line on <code>,</code>, as the file is a comma-delimited <code>.csv</code>.</li>
|
||||||
|
<li>Strip the leading/trailing <code>"</code> marks.</li>
|
||||||
|
</ul>
|
||||||
|
<p>The result is an array with each element representing a single column in that row.</p>
|
||||||
|
<h2 id="cleaning-stage">Cleaning stage<a class="headerlink" href="#cleaning-stage" title="Permanent link">¶</a></h2>
|
||||||
|
<p>Already discussed on the <a href="cleaning.html">Cleaning</a> page.</p>
|
||||||
|
<h2 id="create-a-mapping-table">Create a mapping table<a class="headerlink" href="#create-a-mapping-table" title="Permanent link">¶</a></h2>
|
||||||
|
<p>The mapping table takes each row and creates a <code>(key,value)</code> pair with:</p>
|
||||||
|
<ul>
|
||||||
|
<li>The key being the id across all columns (<code>id_all_columns</code>).</li>
|
||||||
|
<li>The value being the raw data as an array.</li>
|
||||||
|
</ul>
|
||||||
|
<p>The mapping table is then condensed to a single dictionary with these key, value pairs and is used as a side input further down the pipeline.</p>
|
||||||
|
<p>This mapping table is created to ensure the <code>GroupByKey</code> operation is as quick as possible. The more data you have to process in a <code>GroupByKey</code>, the longer the operation takes. By doing the <code>GroupByKey</code> using just the ids, the pipeline can process the files much quicker than if we included the raw data in this operation.</p>
|
||||||
|
<h2 id="prepare-stage">Prepare stage<a class="headerlink" href="#prepare-stage" title="Permanent link">¶</a></h2>
|
||||||
|
<ul>
|
||||||
|
<li>Take the mapping table data (before it is condensed) and create an id that ignores the price and date (<code>id_without_price_date</code>).</li>
|
||||||
|
</ul>
|
||||||
|
<p>This id is not unique per row: all transactions for the same property share the same <code>id_without_price_date</code>.</p>
|
||||||
|
<ul>
|
||||||
|
<li>Create a <code>(key, value)</code> pair with:<ul>
|
||||||
|
<li>The key being <code>id_without_price_date</code>.</li>
|
||||||
|
<li>The value being <code>id_all_columns</code>.</li>
|
||||||
|
</ul>
|
||||||
|
</li>
|
||||||
|
<li>Group by <code>id_without_price_date</code>.</li>
|
||||||
|
</ul>
|
||||||
|
<p>This results in a PCollection that looks like: <code>(id_without_price_date, [id_all_columns,...])</code></p>
|
||||||
|
<ul>
|
||||||
|
<li>Deduplicate the <code>id_all_columns</code> inside this array to eliminate repeated rows that are exactly the same.</li>
|
||||||
|
<li>Use the mapping table as a side input to reinsert the raw data using the <code>id_all_columns</code>.</li>
|
||||||
|
</ul>
|
||||||
|
<details>
|
||||||
|
<summary>Example for No.1 B90 3LA</summary>
|
||||||
|
|
||||||
|
Mapping table (pre-condensed):
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="err">('</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">'</span><span class="p">,</span> <span class="p">[</span><span class="err">'</span><span class="mi">317500</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="mi">2020-11-13</span> <span class="mi">00</span><span class="p">:</span><span class="mi">00</span><span class="err">'</span><span class="p">,</span> <span class="err">'B</span><span class="mi">90</span> <span class="mi">3</span><span class="err">LA'</span><span class="p">,</span> <span class="err">'</span><span class="mi">1</span><span class="err">'</span><span class="p">,</span> <span class="err">''</span><span class="p">,</span> <span class="err">'VERSTONE</span> <span class="err">ROAD'</span><span class="p">,</span> <span class="err">'SHIRLEY'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'WEST</span> <span class="err">MIDLANDS'</span><span class="p">,</span> <span class="err">''</span><span class="p">]</span><span class="err">)</span>
|
||||||
|
<span class="err">('</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">'</span><span class="p">,</span> <span class="p">[</span><span class="err">'</span><span class="mi">317500</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="mi">2020-11-13</span> <span class="mi">00</span><span class="p">:</span><span class="mi">00</span><span class="err">'</span><span class="p">,</span> <span class="err">'B</span><span class="mi">90</span> <span class="mi">3</span><span class="err">LA'</span><span class="p">,</span> <span class="err">'</span><span class="mi">1</span><span class="err">'</span><span class="p">,</span> <span class="err">''</span><span class="p">,</span> <span class="err">'VERSTONE</span> <span class="err">ROAD'</span><span class="p">,</span> <span class="err">'SHIRLEY'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'WEST</span> <span class="err">MIDLANDS'</span><span class="p">,</span> <span class="err">''</span><span class="p">]</span><span class="err">)</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
Mapping table (condensed):
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">{</span><span class="err">'</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">'</span><span class="p">:</span> <span class="p">[</span><span class="err">'</span><span class="mi">317500</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="mi">2020-11-13</span> <span class="mi">00</span><span class="p">:</span><span class="mi">00</span><span class="err">'</span><span class="p">,</span> <span class="err">'B</span><span class="mi">90</span> <span class="mi">3</span><span class="err">LA'</span><span class="p">,</span> <span class="err">'</span><span class="mi">1</span><span class="err">'</span><span class="p">,</span> <span class="err">''</span><span class="p">,</span> <span class="err">'VERSTONE</span> <span class="err">ROAD'</span><span class="p">,</span> <span class="err">'SHIRLEY'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'WEST</span> <span class="err">MIDLANDS'</span><span class="p">,</span> <span class="err">''</span><span class="p">]}</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
Prepared (key, value):
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="err">('</span><span class="kc">fe</span><span class="mi">205</span><span class="err">b</span><span class="kc">fe</span><span class="mi">66</span><span class="err">bc</span><span class="mi">7</span><span class="kc">f</span><span class="mi">18</span><span class="err">c</span><span class="mi">50</span><span class="err">c</span><span class="mi">8</span><span class="kc">f</span><span class="mi">3</span><span class="err">d</span><span class="mf">77e</span><span class="err">c</span><span class="mf">3e30</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">')</span>
|
||||||
|
<span class="err">('</span><span class="kc">fe</span><span class="mi">205</span><span class="err">b</span><span class="kc">fe</span><span class="mi">66</span><span class="err">bc</span><span class="mi">7</span><span class="kc">f</span><span class="mi">18</span><span class="err">c</span><span class="mi">50</span><span class="err">c</span><span class="mi">8</span><span class="kc">f</span><span class="mi">3</span><span class="err">d</span><span class="mf">77e</span><span class="err">c</span><span class="mf">3e30</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">')</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
Prepared (GroupByKey):
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="err">('</span><span class="kc">fe</span><span class="mi">205</span><span class="err">b</span><span class="kc">fe</span><span class="mi">66</span><span class="err">bc</span><span class="mi">7</span><span class="kc">f</span><span class="mi">18</span><span class="err">c</span><span class="mi">50</span><span class="err">c</span><span class="mi">8</span><span class="kc">f</span><span class="mi">3</span><span class="err">d</span><span class="mf">77e</span><span class="err">c</span><span class="mf">3e30</span><span class="err">'</span><span class="p">,</span> <span class="p">[</span><span class="err">'</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">'</span><span class="p">]</span><span class="err">)</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
Prepared (Deduplicated):
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="err">('</span><span class="kc">fe</span><span class="mi">205</span><span class="err">b</span><span class="kc">fe</span><span class="mi">66</span><span class="err">bc</span><span class="mi">7</span><span class="kc">f</span><span class="mi">18</span><span class="err">c</span><span class="mi">50</span><span class="err">c</span><span class="mi">8</span><span class="kc">f</span><span class="mi">3</span><span class="err">d</span><span class="mf">77e</span><span class="err">c</span><span class="mf">3e30</span><span class="err">'</span><span class="p">,</span> <span class="p">[</span><span class="err">'</span><span class="kc">f</span><span class="err">d</span><span class="mi">4634</span><span class="kc">fae</span><span class="err">c</span><span class="mi">47</span><span class="err">c</span><span class="mi">29</span><span class="err">de</span><span class="mi">40</span><span class="err">bb</span><span class="kc">f</span><span class="mi">7840723</span><span class="err">b</span><span class="mi">41</span><span class="err">'</span><span class="p">]</span><span class="err">)</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
Use mapping table as side input:
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="err">('</span><span class="kc">fe</span><span class="mi">205</span><span class="err">b</span><span class="kc">fe</span><span class="mi">66</span><span class="err">bc</span><span class="mi">7</span><span class="kc">f</span><span class="mi">18</span><span class="err">c</span><span class="mi">50</span><span class="err">c</span><span class="mi">8</span><span class="kc">f</span><span class="mi">3</span><span class="err">d</span><span class="mf">77e</span><span class="err">c</span><span class="mf">3e30</span><span class="err">'</span><span class="p">,</span> <span class="p">[</span><span class="err">'</span><span class="mi">317500</span><span class="err">'</span><span class="p">,</span> <span class="err">'</span><span class="mi">2020-11-13</span> <span class="mi">00</span><span class="p">:</span><span class="mi">00</span><span class="err">'</span><span class="p">,</span> <span class="err">'B</span><span class="mi">90</span> <span class="mi">3</span><span class="err">LA'</span><span class="p">,</span> <span class="err">'</span><span class="mi">1</span><span class="err">'</span><span class="p">,</span> <span class="err">''</span><span class="p">,</span> <span class="err">'VERSTONE</span> <span class="err">ROAD'</span><span class="p">,</span> <span class="err">'SHIRLEY'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'SOLIHULL'</span><span class="p">,</span> <span class="err">'WEST</span> <span class="err">MIDLANDS'</span><span class="p">,</span> <span class="err">''</span><span class="p">]</span><span class="err">)</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<h2 id="format-stage">Format stage<a class="headerlink" href="#format-stage" title="Permanent link">¶</a></h2>
|
||||||
|
<p>This stage takes the result and constructs a <code>json</code> object out of the grouped data. The schema for this output is discussed on the following page.</p>
|
||||||
|
<h2 id="save-stage">Save stage<a class="headerlink" href="#save-stage" title="Permanent link">¶</a></h2>
|
||||||
|
<ul>
|
||||||
|
<li>The PCollection is combined with <code class="highlight"><span class="n">beam</span><span class="o">.</span><span class="n">combiners</span><span class="o">.</span><span class="n">ToList</span><span class="p">()</span></code></li>
|
||||||
|
<li>Apply <code>json.dumps()</code> for proper quotation marks for strings.</li>
|
||||||
|
<li>Write to text with <code class="highlight"><span class="n">beam</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">WriteToText</span></code>.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
|
||||||
|
<footer class="md-footer">
|
||||||
|
|
||||||
|
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="cleaning.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Cleaning" rel="prev">
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Previous
|
||||||
|
</span>
|
||||||
|
Cleaning
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<a href="results.html" class="md-footer__link md-footer__link--next" aria-label="Next: Results" rel="next">
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Next
|
||||||
|
</span>
|
||||||
|
Results
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
<div class="md-footer-meta md-typeset">
|
||||||
|
<div class="md-footer-meta__inner md-grid">
|
||||||
|
<div class="md-footer-copyright">
|
||||||
|
|
||||||
|
|
||||||
|
Made with
|
||||||
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||||
|
Material for MkDocs
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div class="md-dialog" data-md-component="dialog">
|
||||||
|
<div class="md-dialog__inner md-typeset"></div>
|
||||||
|
</div>
|
||||||
|
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||||
|
|
||||||
|
|
||||||
|
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||||
|
|
||||||
|
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
962
discussion/cleaning.html
Normal file
962
discussion/cleaning.html
Normal file
@@ -0,0 +1,962 @@
|
|||||||
|
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="en" class="no-js">
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="icon" href="../assets/images/favicon.png">
|
||||||
|
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<title>Cleaning - The Street Group Technical Test</title>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<meta name="theme-color" content="#4051b5">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||||
|
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||||
|
|
||||||
|
|
||||||
|
<script>function __prefix(e){return new URL("..",location).pathname+"."+e}function __get(e,t=localStorage){return JSON.parse(t.getItem(__prefix(e)))}</script>
|
||||||
|
|
||||||
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||||
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||||
|
<label class="md-overlay" for="__drawer"></label>
|
||||||
|
<div data-md-component="skip">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="#cleaning" class="md-skip">
|
||||||
|
Skip to content
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-md-component="announce">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<header class="md-header" data-md-component="header">
|
||||||
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
<label class="md-header__button md-icon" for="__drawer">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||||
|
</label>
|
||||||
|
<div class="md-header__title" data-md-component="header-title">
|
||||||
|
<div class="md-header__ellipsis">
|
||||||
|
<div class="md-header__topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
The Street Group Technical Test
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="md-header__topic" data-md-component="header-topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Cleaning
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-header__button md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-search" data-md-component="search" role="dialog">
|
||||||
|
<label class="md-search__overlay" for="__search"></label>
|
||||||
|
<div class="md-search__inner" role="search">
|
||||||
|
<form class="md-search__form" name="search">
|
||||||
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||||
|
<label class="md-search__icon md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</label>
|
||||||
|
<nav class="md-search__options" aria-label="Search">
|
||||||
|
|
||||||
|
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||||
|
</button>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</form>
|
||||||
|
<div class="md-search__output">
|
||||||
|
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||||
|
<div class="md-search-result" data-md-component="search-result">
|
||||||
|
<div class="md-search-result__meta">
|
||||||
|
Initializing search
|
||||||
|
</div>
|
||||||
|
<ol class="md-search-result__list"></ol>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-header__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="md-container" data-md-component="container">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||||
|
<div class="md-tabs__inner md-grid">
|
||||||
|
<ul class="md-tabs__list">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../index.html" class="md-tabs__link">
|
||||||
|
Documentation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="introduction.html" class="md-tabs__link md-tabs__link--active">
|
||||||
|
Discussion
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<main class="md-main" data-md-component="main">
|
||||||
|
<div class="md-main__inner md-grid">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||||
|
<label class="md-nav__title" for="__drawer">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
The Street Group Technical Test
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-nav__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_1">
|
||||||
|
Documentation
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_1">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Documentation
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../index.html" class="md-nav__link">
|
||||||
|
Welcome
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/installation.html" class="md-nav__link">
|
||||||
|
Installation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/usage.html" class="md-nav__link">
|
||||||
|
Usage
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" checked>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_2">
|
||||||
|
Discussion
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_2">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Discussion
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="introduction.html" class="md-nav__link">
|
||||||
|
Introduction
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="exploration.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active">
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
Cleaning
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<a href="cleaning.html" class="md-nav__link md-nav__link--active">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#uniquely-identify-a-property" class="md-nav__link">
|
||||||
|
Uniquely identify a property.
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Uniquely identify a property.">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#postcode" class="md-nav__link">
|
||||||
|
Postcode
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#paonsaon" class="md-nav__link">
|
||||||
|
PAON/SAON
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#unneeded-columns" class="md-nav__link">
|
||||||
|
Unneeded columns
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#general-cleaning" class="md-nav__link">
|
||||||
|
General cleaning
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="General cleaning">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#upper-case" class="md-nav__link">
|
||||||
|
Upper case
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#strip-leadingtrailing-whitespace" class="md-nav__link">
|
||||||
|
Strip leading/trailing whitespace
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#repeated-rows" class="md-nav__link">
|
||||||
|
Repeated rows
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#uniquely-identify-a-property" class="md-nav__link">
|
||||||
|
Uniquely identify a property.
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Uniquely identify a property.">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#postcode" class="md-nav__link">
|
||||||
|
Postcode
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#paonsaon" class="md-nav__link">
|
||||||
|
PAON/SAON
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#unneeded-columns" class="md-nav__link">
|
||||||
|
Unneeded columns
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#general-cleaning" class="md-nav__link">
|
||||||
|
General cleaning
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="General cleaning">
|
||||||
|
<ul class="md-nav__list">
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#upper-case" class="md-nav__link">
|
||||||
|
Upper case
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#strip-leadingtrailing-whitespace" class="md-nav__link">
|
||||||
|
Strip leading/trailing whitespace
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#repeated-rows" class="md-nav__link">
|
||||||
|
Repeated rows
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-content" data-md-component="content">
|
||||||
|
<article class="md-content__inner md-typeset">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/discussion/cleaning.md" title="Edit this page" class="md-content__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<h1 id="cleaning">Cleaning<a class="headerlink" href="#cleaning" title="Permanent link">¶</a></h1>
|
||||||
|
<p>In this page we discuss the cleaning stages and how best to prepare the data.</p>
|
||||||
|
<h2 id="uniquely-identify-a-property">Uniquely identify a property.<a class="headerlink" href="#uniquely-identify-a-property" title="Permanent link">¶</a></h2>
|
||||||
|
<p>To uniquely identify a property with the data we have, it is enough to have a postcode and the PAON (or the SAON, or a combination of both).</p>
|
||||||
|
<h3 id="postcode">Postcode<a class="headerlink" href="#postcode" title="Permanent link">¶</a></h3>
|
||||||
|
<p>Because so few properties are missing a postcode (0.2% of all records), we will drop all rows that do not have one. This means we will drop some properties that could be identified uniquely with some more work, but the properties that are missing a postcode tend to be unusual/commercial/industrial (e.g. a power plant).</p>
|
||||||
|
<h3 id="paonsaon">PAON/SAON<a class="headerlink" href="#paonsaon" title="Permanent link">¶</a></h3>
|
||||||
|
<p>The PAON has 3 possible formats:</p>
|
||||||
|
<ul>
|
||||||
|
<li>The street number.</li>
|
||||||
|
<li>The building name.</li>
|
||||||
|
<li>The building name and street number (comma delimited).</li>
|
||||||
|
</ul>
|
||||||
|
<p>The SAON:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Identifies the apartment/flat number for the building.</li>
|
||||||
|
<li>If the SAON is present (only 11.7% of values) then the PAON will either be<ul>
|
||||||
|
<li>The building name.</li>
|
||||||
|
<li>The building name and street number.</li>
|
||||||
|
</ul>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
<p>Because of the way the PAON and SAON are defined, if any row is missing <strong>both</strong> of these columns we will drop it, as only having the postcode is not enough (generally speaking) to uniquely identify a property.</p>
|
||||||
|
<div class="admonition tip">
|
||||||
|
<p class="admonition-title">Tip</p>
|
||||||
|
<p>In a production environment we could send these rows to a sink table (in BigQuery for example), rather than drop them outright. Collecting these rows over time might show some patterns on how we can uniquely identify properties that are missing these fields.</p>
|
||||||
|
</div>
|
||||||
|
<p>We split the PAON as part of the cleaning stage. If the PAON contains a comma then it contains the building name and street number. We keep the street number in the same position as the PAON and insert the building name as a new column at the end of the row. If the PAON does not contain a comma we insert a blank column at the end to keep the number of columns in the PCollection consistent.</p>
|
||||||
|
<h3 id="unneeded-columns">Unneeded columns<a class="headerlink" href="#unneeded-columns" title="Permanent link">¶</a></h3>
|
||||||
|
<p>To try to keep computation costs/time down, I decided to drop the categorical columns provided. These include:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Property Type.</li>
|
||||||
|
<li>Old/New.</li>
|
||||||
|
<li>Duration.</li>
|
||||||
|
<li>PPD Category Type.</li>
|
||||||
|
<li>Record Status - monthly file only.</li>
|
||||||
|
</ul>
|
||||||
|
<p>Initially I was attempting to work against the full dataset so dropping these columns would make a difference in the amount of data that needs processing.</p>
|
||||||
|
<p>These columns are also not consistent. E.g. the property <code>63</code> <code>B16, 0AE</code> has three transactions. Two of these transactions have a property type of <code>Other</code> and one transaction has a property type of <code>Terraced</code>.</p>
|
||||||
|
<p>These columns do provide some relevant information (old/new, duration, property type) and these could be included back into the pipeline fairly easily. Due to time constraints I was unable to make this change.</p>
|
||||||
|
<p>In addition, I also dropped the transaction unique identifier column. I wanted the IDs calculated in the pipeline to be consistent in format, and hashing a string (md5) isn't that expensive to calculate with complexity <span class="arithmatex">\(\mathcal{O}(n)\)</span>.</p>
|
||||||
|
<h3 id="general-cleaning">General cleaning<a class="headerlink" href="#general-cleaning" title="Permanent link">¶</a></h3>
|
||||||
|
<h4 id="upper-case">Upper case<a class="headerlink" href="#upper-case" title="Permanent link">¶</a></h4>
|
||||||
|
<p>As all strings in the dataset are upper case, we convert everything in the row to upper case to enforce consistency across the dataset.</p>
|
||||||
|
<h4 id="strip-leadingtrailing-whitespace">Strip leading/trailing whitespace<a class="headerlink" href="#strip-leadingtrailing-whitespace" title="Permanent link">¶</a></h4>
|
||||||
|
<p>We strip all leading/trailing whitespace from each column to enforce consistency.</p>
|
||||||
|
<h4 id="repeated-rows">Repeated rows<a class="headerlink" href="#repeated-rows" title="Permanent link">¶</a></h4>
|
||||||
|
<p>Some of the data is repeated:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Some rows are repeated, with the same date + price + address information but with a unique transaction id.</li>
|
||||||
|
</ul>
|
||||||
|
<details>
|
||||||
|
<summary>Example (PCollection)</summary>
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">[</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"fd4634faec47c29de40bbf7840723b41"</span><span class="p">:</span> <span class="p">[</span>
|
||||||
|
<span class="s2">"317500"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"2020-11-13 00:00"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"B90 3LA"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"1"</span><span class="p">,</span>
|
||||||
|
<span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="s2">"VERSTONE ROAD"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"SHIRLEY"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
||||||
|
<span class="s2">""</span>
|
||||||
|
<span class="p">]</span>
|
||||||
|
<span class="p">},</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"fd4634faec47c29de40bbf7840723b41"</span><span class="p">:</span> <span class="p">[</span>
|
||||||
|
<span class="s2">"317500"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"2020-11-13 00:00"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"B90 3LA"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"1"</span><span class="p">,</span>
|
||||||
|
<span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="s2">"VERSTONE ROAD"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"SHIRLEY"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
||||||
|
<span class="s2">""</span>
|
||||||
|
<span class="p">]</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
<span class="p">]</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<p>These rows will be deduplicated as part of the pipeline.</p>
|
||||||
|
<ul>
|
||||||
|
<li>Some rows have the same date + address information, but different prices.</li>
|
||||||
|
</ul>
|
||||||
|
<p>It would be very unusual to see multiple transactions on the same date for the same property. One reason could be that there was a data entry error, resulting in two different transactions with only one being the real price. As the date column does not contain the time (it is fixed at <code>00:00</code>) it is impossible to tell.</p>
|
||||||
|
<p>Another reason could be missing building/flat/apartment information in this entry.</p>
|
||||||
|
<p>We <strong>keep</strong> these in the data, resulting in some properties having multiple transactions with different prices on the same date. Without a time or more information to go on, it is difficult to see how these could be filtered out.</p>
|
||||||
|
<details>
|
||||||
|
<summary>Example (Output)</summary>
|
||||||
|
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">[</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"property_id"</span><span class="p">:</span> <span class="s2">"20d5c335c8d822a40baab0ecd57e92a4"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"readable_address"</span><span class="p">:</span> <span class="s2">"53 PAVENHAM DRIVE\nBIRMINGHAM\nWEST MIDLANDS\nB5 7TN"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"flat_appartment"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="nt">"builing"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="nt">"number"</span><span class="p">:</span> <span class="s2">"53"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"street"</span><span class="p">:</span> <span class="s2">"PAVENHAM DRIVE"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"locality"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="nt">"town"</span><span class="p">:</span> <span class="s2">"BIRMINGHAM"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"district"</span><span class="p">:</span> <span class="s2">"BIRMINGHAM"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"county"</span><span class="p">:</span> <span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"postcode"</span><span class="p">:</span> <span class="s2">"B5 7TN"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"property_transactions"</span><span class="p">:</span> <span class="p">[</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"price"</span><span class="p">:</span> <span class="mi">270000</span><span class="p">,</span>
|
||||||
|
<span class="nt">"transaction_date"</span><span class="p">:</span> <span class="s2">"2020-04-23"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"year"</span><span class="p">:</span> <span class="mi">2020</span>
|
||||||
|
<span class="p">},</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"price"</span><span class="p">:</span> <span class="mi">364000</span><span class="p">,</span>
|
||||||
|
<span class="nt">"transaction_date"</span><span class="p">:</span> <span class="s2">"2020-04-23"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"year"</span><span class="p">:</span> <span class="mi">2020</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
<span class="p">],</span>
|
||||||
|
<span class="nt">"latest_transaction_year"</span><span class="p">:</span> <span class="mi">2020</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
<span class="p">]</span>
|
||||||
|
</code></pre></div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
|
||||||
|
<footer class="md-footer">
|
||||||
|
|
||||||
|
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="exploration.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Data Exploration Report" rel="prev">
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Previous
|
||||||
|
</span>
|
||||||
|
Data Exploration Report
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<a href="approach.html" class="md-footer__link md-footer__link--next" aria-label="Next: Approach" rel="next">
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Next
|
||||||
|
</span>
|
||||||
|
Approach
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
<div class="md-footer-meta md-typeset">
|
||||||
|
<div class="md-footer-meta__inner md-grid">
|
||||||
|
<div class="md-footer-copyright">
|
||||||
|
|
||||||
|
|
||||||
|
Made with
|
||||||
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||||
|
Material for MkDocs
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div class="md-dialog" data-md-component="dialog">
|
||||||
|
<div class="md-dialog__inner md-typeset"></div>
|
||||||
|
</div>
|
||||||
|
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||||
|
|
||||||
|
|
||||||
|
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||||
|
|
||||||
|
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -207,6 +207,22 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-tabs__item">
|
<li class="md-tabs__item">
|
||||||
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
@@ -394,10 +410,144 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
Data Exploration Report
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
<a href="exploration.html" class="md-nav__link md-nav__link--active">
|
<a href="exploration.html" class="md-nav__link md-nav__link--active">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#interesting-observations" class="md-nav__link">
|
||||||
|
Interesting observations
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
</li>
|
</li>
|
||||||
|
|
||||||
|
|
||||||
@@ -443,6 +593,21 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#interesting-observations" class="md-nav__link">
|
||||||
|
Interesting observations
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
</nav>
|
</nav>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -459,9 +624,66 @@
|
|||||||
|
|
||||||
|
|
||||||
<h1 id="data-exploration-report">Data Exploration Report<a class="headerlink" href="#data-exploration-report" title="Permanent link">¶</a></h1>
|
<h1 id="data-exploration-report">Data Exploration Report<a class="headerlink" href="#data-exploration-report" title="Permanent link">¶</a></h1>
|
||||||
<p>A brief exploration was done on the <strong>full</strong> dataset using the module <code>pandas-profiling</code>. The module uses <code>pandas</code> to load a dataset and automatically produce quantile/descriptive statistics, common values, extreme values, skew, kurtosis etc.</p>
|
<p>A brief exploration was done on the <strong>full</strong> dataset using the module <code>pandas-profiling</code>. The module uses <code>pandas</code> to load a dataset and automatically produce quantile/descriptive statistics, common values, extreme values, skew, kurtosis etc. and produces a report <code>.html</code> file that can be viewed interactively in your browser.</p>
|
||||||
<p>The script used to generate this report is located in <code>./exploration/report.py</code>.</p>
|
<p>The script used to generate this report is located in <code>./exploration/report.py</code> and can be viewed below.</p>
|
||||||
|
<details>
|
||||||
|
<summary>report.py</summary>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="kn">import</span> <span class="nn">pathlib</span>
|
||||||
|
|
||||||
|
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
|
||||||
|
<span class="kn">from</span> <span class="nn">pandas_profiling</span> <span class="kn">import</span> <span class="n">ProfileReport</span>
|
||||||
|
|
||||||
|
|
||||||
|
<span class="k">def</span> <span class="nf">main</span><span class="p">():</span>
|
||||||
|
<span class="n">input_file</span> <span class="o">=</span> <span class="p">(</span>
|
||||||
|
<span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="vm">__file__</span><span class="p">)</span><span class="o">.</span><span class="n">parents</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">/</span> <span class="s2">"data"</span> <span class="o">/</span> <span class="s2">"input"</span> <span class="o">/</span> <span class="s2">"pp-complete.csv"</span>
|
||||||
|
<span class="p">)</span>
|
||||||
|
<span class="k">with</span> <span class="n">input_file</span><span class="o">.</span><span class="n">open</span><span class="p">()</span> <span class="k">as</span> <span class="n">csv</span><span class="p">:</span>
|
||||||
|
<span class="n">df_report</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span>
|
||||||
|
<span class="n">csv</span><span class="p">,</span>
|
||||||
|
<span class="n">names</span><span class="o">=</span><span class="p">[</span>
|
||||||
|
<span class="s2">"transaction_id"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"price"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"date_of_transfer"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"postcode"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"property_type"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"old_new"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"duration"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"paon"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"saon"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"street"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"locality"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"town_city"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"district"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"county"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"ppd_category"</span><span class="p">,</span>
|
||||||
|
<span class="s2">"record_status"</span><span class="p">,</span>
|
||||||
|
<span class="p">],</span>
|
||||||
|
<span class="p">)</span>
|
||||||
|
<span class="n">profile</span> <span class="o">=</span> <span class="n">ProfileReport</span><span class="p">(</span><span class="n">df_report</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="s2">"Price Paid Data"</span><span class="p">,</span> <span class="n">minimal</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||||||
|
<span class="n">profile</span><span class="o">.</span><span class="n">to_file</span><span class="p">(</span><span class="s2">"price_paid_data_report.html"</span><span class="p">)</span>
|
||||||
|
|
||||||
|
|
||||||
|
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span>
|
||||||
|
<span class="n">main</span><span class="p">()</span>
|
||||||
|
</code></pre></div>
|
||||||
|
</details>
|
||||||
|
|
||||||
<p>The report can be viewed by clicking the Data Exploration Report tab at the top of the page.</p>
|
<p>The report can be viewed by clicking the Data Exploration Report tab at the top of the page.</p>
|
||||||
|
<h2 id="interesting-observations">Interesting observations<a class="headerlink" href="#interesting-observations" title="Permanent link">¶</a></h2>
|
||||||
|
<p>When looking at the report we are looking for data quality and missing observations. The statistics are interesting to see but are largely irrelevant for this task.</p>
|
||||||
|
<p>The data overall looks very good for a dataset of its size (~27 million records). For important fields there are no missing values:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Every row has a price.</li>
|
||||||
|
<li>Every row has a unique transaction ID.</li>
|
||||||
|
<li>Every row has a transaction date.</li>
|
||||||
|
</ul>
|
||||||
|
<p>Some fields that we will need are missing data:</p>
|
||||||
|
<ul>
|
||||||
|
<li>~42,000 (0.2%) are missing a Postcode.</li>
|
||||||
|
<li>~4,000 (&lt;0.1%) are missing a PAON (primary addressable object name).</li>
|
||||||
|
<li>~412,000 (1.6%) are missing a Street Name.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -497,13 +719,13 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<a href="../pandas-profiling/report.html" class="md-footer__link md-footer__link--next" aria-label="Next: Data Exploration Report" rel="next">
|
<a href="cleaning.html" class="md-footer__link md-footer__link--next" aria-label="Next: Cleaning" rel="next">
|
||||||
<div class="md-footer__title">
|
<div class="md-footer__title">
|
||||||
<div class="md-ellipsis">
|
<div class="md-ellipsis">
|
||||||
<span class="md-footer__direction">
|
<span class="md-footer__direction">
|
||||||
Next
|
Next
|
||||||
</span>
|
</span>
|
||||||
Data Exploration Report
|
Cleaning
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="md-footer__button md-icon">
|
<div class="md-footer__button md-icon">
|
||||||
|
|||||||
@@ -207,6 +207,22 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-tabs__item">
|
<li class="md-tabs__item">
|
||||||
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
@@ -403,6 +419,110 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
</li>
|
</li>
|
||||||
@@ -464,8 +584,6 @@
|
|||||||
<li>Data exploration</li>
|
<li>Data exploration</li>
|
||||||
<li>Cleaning the data</li>
|
<li>Cleaning the data</li>
|
||||||
<li>Interpreting the results</li>
|
<li>Interpreting the results</li>
|
||||||
<li>Deploying on GCP DataFlow</li>
|
|
||||||
<li>Improvements</li>
|
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
776
discussion/results.html
Normal file
776
discussion/results.html
Normal file
@@ -0,0 +1,776 @@
|
|||||||
|
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="en" class="no-js">
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="icon" href="../assets/images/favicon.png">
|
||||||
|
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<title>Results - The Street Group Technical Test</title>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<meta name="theme-color" content="#4051b5">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||||
|
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||||
|
|
||||||
|
|
||||||
|
<script>function __prefix(e){return new URL("..",location).pathname+"."+e}function __get(e,t=localStorage){return JSON.parse(t.getItem(__prefix(e)))}</script>
|
||||||
|
|
||||||
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||||
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||||
|
<label class="md-overlay" for="__drawer"></label>
|
||||||
|
<div data-md-component="skip">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="#results" class="md-skip">
|
||||||
|
Skip to content
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-md-component="announce">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<header class="md-header" data-md-component="header">
|
||||||
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
<label class="md-header__button md-icon" for="__drawer">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||||
|
</label>
|
||||||
|
<div class="md-header__title" data-md-component="header-title">
|
||||||
|
<div class="md-header__ellipsis">
|
||||||
|
<div class="md-header__topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
The Street Group Technical Test
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="md-header__topic" data-md-component="header-topic">
|
||||||
|
<span class="md-ellipsis">
|
||||||
|
|
||||||
|
Results
|
||||||
|
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-header__button md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-search" data-md-component="search" role="dialog">
|
||||||
|
<label class="md-search__overlay" for="__search"></label>
|
||||||
|
<div class="md-search__inner" role="search">
|
||||||
|
<form class="md-search__form" name="search">
|
||||||
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||||
|
<label class="md-search__icon md-icon" for="__search">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</label>
|
||||||
|
<nav class="md-search__options" aria-label="Search">
|
||||||
|
|
||||||
|
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||||
|
</button>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</form>
|
||||||
|
<div class="md-search__output">
|
||||||
|
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||||
|
<div class="md-search-result" data-md-component="search-result">
|
||||||
|
<div class="md-search-result__meta">
|
||||||
|
Initializing search
|
||||||
|
</div>
|
||||||
|
<ol class="md-search-result__list"></ol>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-header__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="md-container" data-md-component="container">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||||
|
<div class="md-tabs__inner md-grid">
|
||||||
|
<ul class="md-tabs__list">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../index.html" class="md-tabs__link">
|
||||||
|
Documentation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="introduction.html" class="md-tabs__link md-tabs__link--active">
|
||||||
|
Discussion
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<main class="md-main" data-md-component="main">
|
||||||
|
<div class="md-main__inner md-grid">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||||
|
<label class="md-nav__title" for="__drawer">
|
||||||
|
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||||
|
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||||
|
|
||||||
|
</a>
|
||||||
|
The Street Group Technical Test
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<div class="md-nav__source">
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||||
|
<div class="md-source__icon md-icon">
|
||||||
|
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-source__repository">
|
||||||
|
GitHub
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_1">
|
||||||
|
Documentation
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_1">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Documentation
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../index.html" class="md-nav__link">
|
||||||
|
Welcome
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/installation.html" class="md-nav__link">
|
||||||
|
Installation
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../documentation/usage.html" class="md-nav__link">
|
||||||
|
Usage
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" checked>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_2">
|
||||||
|
Discussion
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_2">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Discussion
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="introduction.html" class="md-nav__link">
|
||||||
|
Introduction
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="exploration.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--active">
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||||
|
Results
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<a href="results.html" class="md-nav__link md-nav__link--active">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#readable_address" class="md-nav__link">
|
||||||
|
readable_address
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#property_transactions" class="md-nav__link">
|
||||||
|
property_transactions
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#latest_transaction_year" class="md-nav__link">
|
||||||
|
latest_transaction_year
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||||
|
Data Exploration Report
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||||
|
<div class="md-sidebar__scrollwrap">
|
||||||
|
<div class="md-sidebar__inner">
|
||||||
|
|
||||||
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__title" for="__toc">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
Table of contents
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#readable_address" class="md-nav__link">
|
||||||
|
readable_address
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#property_transactions" class="md-nav__link">
|
||||||
|
property_transactions
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="#latest_transaction_year" class="md-nav__link">
|
||||||
|
latest_transaction_year
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</li>
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="md-content" data-md-component="content">
|
||||||
|
<article class="md-content__inner md-typeset">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/discussion/results.md" title="Edit this page" class="md-content__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<h1 id="results">Results<a class="headerlink" href="#results" title="Permanent link">¶</a></h1>
|
||||||
|
<p>The resulting output <code>.json</code> looks like (for the previous example using No. 1 <code>B90 3LA</code>):</p>
|
||||||
|
<div class="highlight"><pre><span></span><code><span class="p">[</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"property_id"</span><span class="p">:</span> <span class="s2">"fe205bfe66bc7f18c50c8f3d77ec3e30"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"readable_address"</span><span class="p">:</span> <span class="s2">"1 VERSTONE ROAD\nSHIRLEY\nSOLIHULL\nWEST MIDLANDS\nB90 3LA"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"flat_appartment"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="nt">"builing"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
||||||
|
<span class="nt">"number"</span><span class="p">:</span> <span class="s2">"1"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"street"</span><span class="p">:</span> <span class="s2">"VERSTONE ROAD"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"locality"</span><span class="p">:</span> <span class="s2">"SHIRLEY"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"town"</span><span class="p">:</span> <span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"district"</span><span class="p">:</span> <span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"county"</span><span class="p">:</span> <span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"postcode"</span><span class="p">:</span> <span class="s2">"B90 3LA"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"property_transactions"</span><span class="p">:</span> <span class="p">[</span>
|
||||||
|
<span class="p">{</span>
|
||||||
|
<span class="nt">"price"</span><span class="p">:</span> <span class="mi">317500</span><span class="p">,</span>
|
||||||
|
<span class="nt">"transaction_date"</span><span class="p">:</span> <span class="s2">"2020-11-13"</span><span class="p">,</span>
|
||||||
|
<span class="nt">"year"</span><span class="p">:</span> <span class="mi">2020</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
<span class="p">],</span>
|
||||||
|
<span class="nt">"latest_transaction_year"</span><span class="p">:</span> <span class="mi">2020</span>
|
||||||
|
<span class="p">}</span>
|
||||||
|
<span class="p">]</span>
|
||||||
|
</code></pre></div>
|
||||||
|
<p>The standard property information is included; below we briefly discuss the additional fields included in this output file.</p>
|
||||||
|
<h2 id="readable_address">readable_address<a class="headerlink" href="#readable_address" title="Permanent link">¶</a></h2>
|
||||||
|
<p>The components that make up the address in the dataset are often repetitive, with the locality, town/city, district and county often sharing the same value. This can result in hard-to-read addresses if we simply stack all the components sequentially.</p>
|
||||||
|
<p>The <code>readable_address</code> provides an easy-to-read address that strips this repetition out, by doing pairwise comparisons between the four components and applying a mask. The result is an address that could be served to the end user, or easily displayed on a page.</p>
|
||||||
|
<p>This saves any user having to apply the same logic simply to display the address somewhere. The full address of a property should be easy to read and easily accessible.</p>
|
||||||
|
<h2 id="property_transactions">property_transactions<a class="headerlink" href="#property_transactions" title="Permanent link">¶</a></h2>
|
||||||
|
<p>This array contains an object for each transaction for that property that has the price and year as an <code>int</code>, with the date having the <code>00:00</code> time stripped out.</p>
|
||||||
|
<h2 id="latest_transaction_year">latest_transaction_year<a class="headerlink" href="#latest_transaction_year" title="Permanent link">¶</a></h2>
|
||||||
|
<p>The year of the latest transaction is extracted from the array of <code>property_transactions</code> and placed in the top level of the <code>json</code> object. This allows any end user to easily search for properties that haven't been sold in a given period of time, without having to write this logic themselves.</p>
|
||||||
|
<p>A consumer should be able to use this data to answer questions like:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Give me all properties in the town of Solihull that haven't been sold in the past 10 years.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
|
||||||
|
<footer class="md-footer">
|
||||||
|
|
||||||
|
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||||
|
|
||||||
|
|
||||||
|
<a href="approach.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Approach" rel="prev">
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Previous
|
||||||
|
</span>
|
||||||
|
Approach
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<a href="../dataflow/index.html" class="md-footer__link md-footer__link--next" aria-label="Next: Running on DataFlow" rel="next">
|
||||||
|
<div class="md-footer__title">
|
||||||
|
<div class="md-ellipsis">
|
||||||
|
<span class="md-footer__direction">
|
||||||
|
Next
|
||||||
|
</span>
|
||||||
|
Running on DataFlow
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="md-footer__button md-icon">
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
<div class="md-footer-meta md-typeset">
|
||||||
|
<div class="md-footer-meta__inner md-grid">
|
||||||
|
<div class="md-footer-copyright">
|
||||||
|
|
||||||
|
|
||||||
|
Made with
|
||||||
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||||
|
Material for MkDocs
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div class="md-dialog" data-md-component="dialog">
|
||||||
|
<div class="md-dialog__inner md-typeset"></div>
|
||||||
|
</div>
|
||||||
|
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||||
|
|
||||||
|
|
||||||
|
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||||
|
|
||||||
|
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -207,6 +207,22 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-tabs__item">
|
<li class="md-tabs__item">
|
||||||
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
@@ -433,6 +449,110 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
</li>
|
</li>
|
||||||
|
|||||||
@@ -207,6 +207,22 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-tabs__item">
|
<li class="md-tabs__item">
|
||||||
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
@@ -454,6 +470,110 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../discussion/results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
</li>
|
</li>
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 370 KiB |
120
index.html
120
index.html
@@ -207,6 +207,22 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-tabs__item">
|
||||||
|
<a href="dataflow/index.html" class="md-tabs__link">
|
||||||
|
DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<li class="md-tabs__item">
|
<li class="md-tabs__item">
|
||||||
<a href="pandas-profiling/report.html" class="md-tabs__link">
|
<a href="pandas-profiling/report.html" class="md-tabs__link">
|
||||||
Data Exploration Report
|
Data Exploration Report
|
||||||
@@ -433,6 +449,110 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="discussion/cleaning.html" class="md-nav__link">
|
||||||
|
Cleaning
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="discussion/approach.html" class="md-nav__link">
|
||||||
|
Approach
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="discussion/results.html" class="md-nav__link">
|
||||||
|
Results
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item md-nav__item--nested">
|
||||||
|
|
||||||
|
|
||||||
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<label class="md-nav__link" for="__nav_3">
|
||||||
|
DataFlow
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||||
|
<label class="md-nav__title" for="__nav_3">
|
||||||
|
<span class="md-nav__icon md-icon"></span>
|
||||||
|
DataFlow
|
||||||
|
</label>
|
||||||
|
<ul class="md-nav__list" data-md-scrollfix>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="dataflow/index.html" class="md-nav__link">
|
||||||
|
Running on DataFlow
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<li class="md-nav__item">
|
||||||
|
<a href="dataflow/scaling.html" class="md-nav__link">
|
||||||
|
Scaling to the Full DataSet
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</nav>
|
</nav>
|
||||||
</li>
|
</li>
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
25
sitemap.xml
25
sitemap.xml
@@ -25,4 +25,29 @@
|
|||||||
<lastmod>2021-09-27</lastmod>
|
<lastmod>2021-09-27</lastmod>
|
||||||
<changefreq>daily</changefreq>
|
<changefreq>daily</changefreq>
|
||||||
</url>
|
</url>
|
||||||
|
<url>
|
||||||
|
<loc>None</loc>
|
||||||
|
<lastmod>2021-09-27</lastmod>
|
||||||
|
<changefreq>daily</changefreq>
|
||||||
|
</url>
|
||||||
|
<url>
|
||||||
|
<loc>None</loc>
|
||||||
|
<lastmod>2021-09-27</lastmod>
|
||||||
|
<changefreq>daily</changefreq>
|
||||||
|
</url>
|
||||||
|
<url>
|
||||||
|
<loc>None</loc>
|
||||||
|
<lastmod>2021-09-27</lastmod>
|
||||||
|
<changefreq>daily</changefreq>
|
||||||
|
</url>
|
||||||
|
<url>
|
||||||
|
<loc>None</loc>
|
||||||
|
<lastmod>2021-09-27</lastmod>
|
||||||
|
<changefreq>daily</changefreq>
|
||||||
|
</url>
|
||||||
|
<url>
|
||||||
|
<loc>None</loc>
|
||||||
|
<lastmod>2021-09-27</lastmod>
|
||||||
|
<changefreq>daily</changefreq>
|
||||||
|
</url>
|
||||||
</urlset>
|
</urlset>
|
||||||
BIN
sitemap.xml.gz
BIN
sitemap.xml.gz
Binary file not shown.
Reference in New Issue
Block a user