mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 11:55:45 +00:00
962 lines
31 KiB
HTML
962 lines
31 KiB
HTML
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="../assets/images/favicon.png">
|
|
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
|
|
|
|
|
|
|
<title>Cleaning - The Street Group Technical Test</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
|
|
|
|
|
|
|
<meta name="theme-color" content="#4051b5">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
|
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
|
|
|
|
|
<script>
  /* Build a storage key for `name`: the pathname of ".." resolved against
     the current location, followed by a dot and `name`. */
  function __prefix(name) {
    const scope = new URL("..", location);
    return scope.pathname + "." + name;
  }

  /* Look up the entry stored under the prefixed key in `storage`
     (localStorage unless another store is passed) and JSON-parse it. */
  function __get(name, storage = localStorage) {
    const raw = storage.getItem(__prefix(name));
    return JSON.parse(raw);
  }
</script>
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
|
|
|
|
|
|
<a href="#cleaning" class="md-skip">
|
|
Skip to content
|
|
</a>
|
|
|
|
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
<header class="md-header" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
The Street Group Technical Test
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
Cleaning
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<label class="md-header__button md-icon" for="__search">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
|
</label>
|
|
|
|
<div class="md-search" data-md-component="search" role="dialog">
|
|
<label class="md-search__overlay" for="__search"></label>
|
|
<div class="md-search__inner" role="search">
|
|
<form class="md-search__form" name="search">
|
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
|
<label class="md-search__icon md-icon" for="__search">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
|
</label>
|
|
<nav class="md-search__options" aria-label="Search">
|
|
|
|
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
|
</button>
|
|
</nav>
|
|
|
|
</form>
|
|
<div class="md-search__output">
|
|
<div class="md-search__scrollwrap" data-md-scrollfix>
|
|
<div class="md-search-result" data-md-component="search-result">
|
|
<div class="md-search-result__meta">
|
|
Initializing search
|
|
</div>
|
|
<ol class="md-search-result__list"></ol>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class="md-header__source">
|
|
|
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
GitHub
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
|
<div class="md-tabs__inner md-grid">
|
|
<ul class="md-tabs__list">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../index.html" class="md-tabs__link">
|
|
Documentation
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="introduction.html" class="md-tabs__link md-tabs__link--active">
|
|
Discussion
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../dataflow/index.html" class="md-tabs__link">
|
|
DataFlow
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
|
Data Exploration Report
|
|
</a>
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
</div>
|
|
</nav>
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
|
|
|
</a>
|
|
The Street Group Technical Test
|
|
</label>
|
|
|
|
<div class="md-nav__source">
|
|
|
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
GitHub
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_1">
|
|
Documentation
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
|
<label class="md-nav__title" for="__nav_1">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Documentation
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../index.html" class="md-nav__link">
|
|
Welcome
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../documentation/installation.html" class="md-nav__link">
|
|
Installation
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../documentation/usage.html" class="md-nav__link">
|
|
Usage
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" checked>
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_2">
|
|
Discussion
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Discussion
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="introduction.html" class="md-nav__link">
|
|
Introduction
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="exploration.html" class="md-nav__link">
|
|
Data Exploration Report
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active">
|
|
|
|
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
|
Cleaning
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<a href="cleaning.html" class="md-nav__link md-nav__link--active">
|
|
Cleaning
|
|
</a>
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#uniquely-identify-a-property" class="md-nav__link">
|
|
Uniquely identify a property.
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Uniquely identify a property.">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#postcode" class="md-nav__link">
|
|
Postcode
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#paonsaon" class="md-nav__link">
|
|
PAON/SAON
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#unneeded-columns" class="md-nav__link">
|
|
Unneeded columns
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#general-cleaning" class="md-nav__link">
|
|
General cleaning
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="General cleaning">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#upper-case" class="md-nav__link">
|
|
Upper case
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#strip-leadingtrailing-whitespace" class="md-nav__link">
|
|
Strip leading/trailing whitespace
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#repeated-rows" class="md-nav__link">
|
|
Repeated rows
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="approach.html" class="md-nav__link">
|
|
Approach
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="results.html" class="md-nav__link">
|
|
Results
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" >
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3">
|
|
DataFlow
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
DataFlow
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../dataflow/index.html" class="md-nav__link">
|
|
Running on DataFlow
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../dataflow/scaling.html" class="md-nav__link">
|
|
Scaling to the Full DataSet
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
|
Data Exploration Report
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#uniquely-identify-a-property" class="md-nav__link">
|
|
Uniquely identify a property.
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Uniquely identify a property.">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#postcode" class="md-nav__link">
|
|
Postcode
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#paonsaon" class="md-nav__link">
|
|
PAON/SAON
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#unneeded-columns" class="md-nav__link">
|
|
Unneeded columns
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#general-cleaning" class="md-nav__link">
|
|
General cleaning
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="General cleaning">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#upper-case" class="md-nav__link">
|
|
Upper case
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#strip-leadingtrailing-whitespace" class="md-nav__link">
|
|
Strip leading/trailing whitespace
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#repeated-rows" class="md-nav__link">
|
|
Repeated rows
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/discussion/cleaning.md" title="Edit this page" class="md-content__button md-icon">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
|
</a>
|
|
|
|
|
|
<h1 id="cleaning">Cleaning<a class="headerlink" href="#cleaning" title="Permanent link">¶</a></h1>
|
|
<p>In this page we discuss the cleaning stages and how best to prepare the data.</p>
|
|
<h2 id="uniquely-identify-a-property">Uniquely identify a property.<a class="headerlink" href="#uniquely-identify-a-property" title="Permanent link">¶</a></h2>
|
|
<p>To uniquely identify a property with the data we have it is enough to have a Postcode and the PAON (or SAON or combination of both).</p>
|
|
<h3 id="postcode">Postcode<a class="headerlink" href="#postcode" title="Permanent link">¶</a></h3>
|
|
<p>Because so few properties are missing a postcode (0.2% of all records) we will drop all rows that do not have one. This will drop some properties that could be identified uniquely with some more work, but the properties that are missing a postcode tend to be unusual/commercial/industrial (e.g. a power plant).</p>
|
|
<h3 id="paonsaon">PAON/SAON<a class="headerlink" href="#paonsaon" title="Permanent link">¶</a></h3>
|
|
<p>The PAON has 3 possible formats:</p>
|
|
<ul>
|
|
<li>The street number.</li>
|
|
<li>The building name.</li>
|
|
<li>The building name and street number (comma delimited).</li>
|
|
</ul>
|
|
<p>The SAON:</p>
|
|
<ul>
|
|
<li>Identifies the apartment/flat number for the building.</li>
|
|
<li>If the SAON is present (only 11.7% of values) then the PAON will either be<ul>
|
|
<li>The building name.</li>
|
|
<li>The building name and street number.</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
<p>Because of the way the PAON and SAON are defined, if any row is missing <strong>both</strong> of these columns we will drop it, as having only the postcode is not enough (generally speaking) to uniquely identify a property.</p>
|
|
<div class="admonition tip">
|
|
<p class="admonition-title">Tip</p>
|
|
<p>In a production environment we could send these rows to a sink table (in BigQuery for example), rather than drop them outright. Collecting these rows over time might show some patterns on how we can uniquely identify properties that are missing these fields.</p>
|
|
</div>
|
|
<p>We split the PAON as part of the cleaning stage. If the PAON contains a comma then it contains the building name and street number. We keep the street number in the same position as the PAON and insert the building name as a new column at the end of the row. If the PAON does not contain a comma we insert a blank column at the end to keep the number of columns in the PCollection consistent.</p>
|
|
<h3 id="unneeded-columns">Unneeded columns<a class="headerlink" href="#unneeded-columns" title="Permanent link">¶</a></h3>
|
|
<p>To try to keep computation costs/time down, I decided to drop the categorical columns provided. These include:</p>
|
|
<ul>
|
|
<li>Property Type.</li>
|
|
<li>Old/New.</li>
|
|
<li>Duration.</li>
|
|
<li>PPD Category Type.</li>
|
|
<li>Record Status - monthly file only.</li>
|
|
</ul>
|
|
<p>Initially I was attempting to work against the full dataset so dropping these columns would make a difference in the amount of data that needs processing.</p>
|
|
<p>These columns are also not consistent. For example, the property <code>63</code> <code>B16, 0AE</code> has three transactions. Two of these transactions have a property type of <code>Other</code> and one transaction has a property type of <code>Terraced</code>.</p>
|
|
<p>These columns do provide some relevant information (old/new, duration, property type) and these could be included back into the pipeline fairly easily. Due to time constraints I was unable to make this change.</p>
|
|
<p>In addition, I also dropped the transaction unique identifier column. I wanted the IDs calculated in the pipeline to be consistent in format, and hashing a string (md5) isn't that expensive to calculate with complexity <span class="arithmatex">\(\mathcal{O}(n)\)</span>.</p>
|
|
<h3 id="general-cleaning">General cleaning<a class="headerlink" href="#general-cleaning" title="Permanent link">¶</a></h3>
|
|
<h4 id="upper-case">Upper case<a class="headerlink" href="#upper-case" title="Permanent link">¶</a></h4>
|
|
<p>As all strings in the dataset are upper case, we convert everything in the row to upper case to enforce consistency across the dataset.</p>
|
|
<h4 id="strip-leadingtrailing-whitespace">Strip leading/trailing whitespace<a class="headerlink" href="#strip-leadingtrailing-whitespace" title="Permanent link">¶</a></h4>
|
|
<p>We strip all leading/trailing whitespace from each column to enforce consistency.</p>
|
|
<h4 id="repeated-rows">Repeated rows<a class="headerlink" href="#repeated-rows" title="Permanent link">¶</a></h4>
|
|
<p>Some of the data is repeated:</p>
|
|
<ul>
|
|
<li>Some rows are repeated, with the same date + price + address information but with a unique transaction id.</li>
|
|
</ul>
|
|
<details>
|
|
<summary>Example (PCollection)</summary>
|
|
|
|
<div class="highlight"><pre><span></span><code><span class="p">[</span>
|
|
<span class="p">{</span>
|
|
<span class="nt">"fd4634faec47c29de40bbf7840723b41"</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="s2">"317500"</span><span class="p">,</span>
|
|
<span class="s2">"2020-11-13 00:00"</span><span class="p">,</span>
|
|
<span class="s2">"B90 3LA"</span><span class="p">,</span>
|
|
<span class="s2">"1"</span><span class="p">,</span>
|
|
<span class="s2">""</span><span class="p">,</span>
|
|
<span class="s2">"VERSTONE ROAD"</span><span class="p">,</span>
|
|
<span class="s2">"SHIRLEY"</span><span class="p">,</span>
|
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
|
<span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
|
<span class="s2">""</span>
|
|
<span class="p">]</span>
|
|
<span class="p">},</span>
|
|
<span class="p">{</span>
|
|
<span class="nt">"fd4634faec47c29de40bbf7840723b41"</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="s2">"317500"</span><span class="p">,</span>
|
|
<span class="s2">"2020-11-13 00:00"</span><span class="p">,</span>
|
|
<span class="s2">"B90 3LA"</span><span class="p">,</span>
|
|
<span class="s2">"1"</span><span class="p">,</span>
|
|
<span class="s2">""</span><span class="p">,</span>
|
|
<span class="s2">"VERSTONE ROAD"</span><span class="p">,</span>
|
|
<span class="s2">"SHIRLEY"</span><span class="p">,</span>
|
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
|
<span class="s2">"SOLIHULL"</span><span class="p">,</span>
|
|
<span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
|
<span class="s2">""</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
<span class="p">]</span>
|
|
</code></pre></div>
|
|
|
|
</details>
|
|
|
|
<p>These rows will be deduplicated as part of the pipeline.</p>
|
|
<ul>
|
|
<li>Some rows have the same date + address information, but different prices.</li>
|
|
</ul>
|
|
<p>It would be very unusual to see multiple transactions on the same date for the same property. One reason could be that there was a data entry error, resulting in two different transactions with only one being the real price. As the date column does not contain the time (it is fixed at <code>00:00</code>) it is impossible to tell.</p>
|
|
<p>Another reason could be missing building/flat/apartment information in this entry.</p>
|
|
<p>We <strong>keep</strong> these in the data, resulting in some properties having multiple transactions with different prices on the same date. Without a time or more information to go on, it is difficult to see how these could be filtered out.</p>
|
|
<details>
|
|
<summary>Example (Output)</summary>
|
|
|
|
<div class="highlight"><pre><span></span><code><span class="p">[</span>
|
|
<span class="p">{</span>
|
|
<span class="nt">"property_id"</span><span class="p">:</span> <span class="s2">"20d5c335c8d822a40baab0ecd57e92a4"</span><span class="p">,</span>
|
|
<span class="nt">"readable_address"</span><span class="p">:</span> <span class="s2">"53 PAVENHAM DRIVE\nBIRMINGHAM\nWEST MIDLANDS\nB5 7TN"</span><span class="p">,</span>
|
|
<span class="nt">"flat_appartment"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
|
<span class="nt">"builing"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
|
<span class="nt">"number"</span><span class="p">:</span> <span class="s2">"53"</span><span class="p">,</span>
|
|
<span class="nt">"street"</span><span class="p">:</span> <span class="s2">"PAVENHAM DRIVE"</span><span class="p">,</span>
|
|
<span class="nt">"locality"</span><span class="p">:</span> <span class="s2">""</span><span class="p">,</span>
|
|
<span class="nt">"town"</span><span class="p">:</span> <span class="s2">"BIRMINGHAM"</span><span class="p">,</span>
|
|
<span class="nt">"district"</span><span class="p">:</span> <span class="s2">"BIRMINGHAM"</span><span class="p">,</span>
|
|
<span class="nt">"county"</span><span class="p">:</span> <span class="s2">"WEST MIDLANDS"</span><span class="p">,</span>
|
|
<span class="nt">"postcode"</span><span class="p">:</span> <span class="s2">"B5 7TN"</span><span class="p">,</span>
|
|
<span class="nt">"property_transactions"</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">{</span>
|
|
<span class="nt">"price"</span><span class="p">:</span> <span class="mi">270000</span><span class="p">,</span>
|
|
<span class="nt">"transaction_date"</span><span class="p">:</span> <span class="s2">"2020-04-23"</span><span class="p">,</span>
|
|
<span class="nt">"year"</span><span class="p">:</span> <span class="mi">2020</span>
|
|
<span class="p">},</span>
|
|
<span class="p">{</span>
|
|
<span class="nt">"price"</span><span class="p">:</span> <span class="mi">364000</span><span class="p">,</span>
|
|
<span class="nt">"transaction_date"</span><span class="p">:</span> <span class="s2">"2020-04-23"</span><span class="p">,</span>
|
|
<span class="nt">"year"</span><span class="p">:</span> <span class="mi">2020</span>
|
|
<span class="p">}</span>
|
|
<span class="p">],</span>
|
|
<span class="nt">"latest_transaction_year"</span><span class="p">:</span> <span class="mi">2020</span>
|
|
<span class="p">}</span>
|
|
<span class="p">]</span>
|
|
</code></pre></div>
|
|
|
|
</details>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
</div>
|
|
|
|
</main>
|
|
|
|
|
|
<footer class="md-footer">
|
|
|
|
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
|
|
|
|
|
<a href="exploration.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Data Exploration Report" rel="prev">
|
|
<div class="md-footer__button md-icon">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
|
</div>
|
|
<div class="md-footer__title">
|
|
<div class="md-ellipsis">
|
|
<span class="md-footer__direction">
|
|
Previous
|
|
</span>
|
|
Data Exploration Report
|
|
</div>
|
|
</div>
|
|
</a>
|
|
|
|
|
|
|
|
<a href="approach.html" class="md-footer__link md-footer__link--next" aria-label="Next: Approach" rel="next">
|
|
<div class="md-footer__title">
|
|
<div class="md-ellipsis">
|
|
<span class="md-footer__direction">
|
|
Next
|
|
</span>
|
|
Approach
|
|
</div>
|
|
</div>
|
|
<div class="md-footer__button md-icon">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
|
</div>
|
|
</a>
|
|
|
|
</nav>
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-footer-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
|
|
|
|
|
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
|
|
|
<!-- polyfill.io changed ownership and served malicious code in 2024; the same polyfill bundle is loaded from Cloudflare's cdnjs mirror instead. -->
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
|
|
|
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |