mirror of
https://github.com/dtomlinson91/street_group_tech_test
synced 2025-12-22 11:55:45 +00:00
Deployed a53d791 with MkDocs version: 1.2.2
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
[ZoneTransfer]
|
||||
ZoneId=3
|
||||
HostUrl=about:internet
|
||||
BIN
dataflow/img/successful_dataflow_job.png
Normal file
BIN
dataflow/img/successful_dataflow_job.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 374 KiB |
803
dataflow/index.html
Normal file
803
dataflow/index.html
Normal file
@@ -0,0 +1,803 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||
|
||||
|
||||
|
||||
<title>Running on DataFlow - The Street Group Technical Test</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||
|
||||
|
||||
|
||||
<meta name="theme-color" content="#4051b5">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||
|
||||
|
||||
<script>function __prefix(e){return new URL("..",location).pathname+"."+e}function __get(e,t=localStorage){return JSON.parse(t.getItem(__prefix(e)))}</script>
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#running-on-dataflow" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<header class="md-header" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
The Street Group Technical Test
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Running on DataFlow
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<label class="md-header__button md-icon" for="__search">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||
</label>
|
||||
|
||||
<div class="md-search" data-md-component="search" role="dialog">
|
||||
<label class="md-search__overlay" for="__search"></label>
|
||||
<div class="md-search__inner" role="search">
|
||||
<form class="md-search__form" name="search">
|
||||
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||
<label class="md-search__icon md-icon" for="__search">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||
</label>
|
||||
<nav class="md-search__options" aria-label="Search">
|
||||
|
||||
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||
</button>
|
||||
</nav>
|
||||
|
||||
</form>
|
||||
<div class="md-search__output">
|
||||
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||
<div class="md-search-result" data-md-component="search-result">
|
||||
<div class="md-search-result__meta">
|
||||
Initializing search
|
||||
</div>
|
||||
<ol class="md-search-result__list"></ol>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="md-header__source">
|
||||
|
||||
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||
<div class="md-source__icon md-icon">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||
</div>
|
||||
<div class="md-source__repository">
|
||||
GitHub
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||
<div class="md-tabs__inner md-grid">
|
||||
<ul class="md-tabs__list">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="../index.html" class="md-tabs__link">
|
||||
Documentation
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="../discussion/introduction.html" class="md-tabs__link">
|
||||
Discussion
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="index.html" class="md-tabs__link md-tabs__link--active">
|
||||
DataFlow
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||
Data Exploration Report
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||
|
||||
</a>
|
||||
The Street Group Technical Test
|
||||
</label>
|
||||
|
||||
<div class="md-nav__source">
|
||||
|
||||
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||
<div class="md-source__icon md-icon">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||
</div>
|
||||
<div class="md-source__repository">
|
||||
GitHub
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_1">
|
||||
Documentation
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||
<label class="md-nav__title" for="__nav_1">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Documentation
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../index.html" class="md-nav__link">
|
||||
Welcome
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../documentation/installation.html" class="md-nav__link">
|
||||
Installation
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../documentation/usage.html" class="md-nav__link">
|
||||
Usage
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" >
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2">
|
||||
Discussion
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Discussion
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/introduction.html" class="md-nav__link">
|
||||
Introduction
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/exploration.html" class="md-nav__link">
|
||||
Data Exploration Report
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/cleaning.html" class="md-nav__link">
|
||||
Cleaning
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/approach.html" class="md-nav__link">
|
||||
Approach
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/results.html" class="md-nav__link">
|
||||
Results
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" checked>
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3">
|
||||
DataFlow
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
DataFlow
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
Running on DataFlow
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="index.html" class="md-nav__link md-nav__link--active">
|
||||
Running on DataFlow
|
||||
</a>
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#prerequisites" class="md-nav__link">
|
||||
Prerequisites
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Prerequisites">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#cloud-storage" class="md-nav__link">
|
||||
Cloud Storage
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#vpc" class="md-nav__link">
|
||||
VPC
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#command" class="md-nav__link">
|
||||
Command
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="scaling.html" class="md-nav__link">
|
||||
Scaling to the Full DataSet
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||
Data Exploration Report
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#prerequisites" class="md-nav__link">
|
||||
Prerequisites
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Prerequisites">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#cloud-storage" class="md-nav__link">
|
||||
Cloud Storage
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#vpc" class="md-nav__link">
|
||||
VPC
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#command" class="md-nav__link">
|
||||
Command
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/dataflow/index.md" title="Edit this page" class="md-content__button md-icon">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||
</a>
|
||||
|
||||
|
||||
<h1 id="running-on-dataflow">Running on DataFlow<a class="headerlink" href="#running-on-dataflow" title="Permanent link">¶</a></h1>
|
||||
<p>The pipeline runs as is on GCP DataFlow. The following documents how I deployed to my personal GCP account but the approach may vary depending on project/account in GCP.</p>
|
||||
<h2 id="prerequisites">Prerequisites<a class="headerlink" href="#prerequisites" title="Permanent link">¶</a></h2>
|
||||
<h3 id="cloud-storage">Cloud Storage<a class="headerlink" href="#cloud-storage" title="Permanent link">¶</a></h3>
|
||||
<ul>
|
||||
<li>A Cloud Storage bucket with the following structure:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre><span></span><code>./input
|
||||
./output
|
||||
./tmp
|
||||
</code></pre></div>
|
||||
<ul>
|
||||
<li>Place the input files into the <code>./input</code> directory in the bucket.</li>
|
||||
</ul>
|
||||
<h3 id="vpc">VPC<a class="headerlink" href="#vpc" title="Permanent link">¶</a></h3>
|
||||
<p>To get around public IP quotas I created a VPC in the <code>europe-west1</code> region that has <code>Private Google Access</code> turned to <code>ON</code>.</p>
|
||||
<h2 id="command">Command<a class="headerlink" href="#command" title="Permanent link">¶</a></h2>
|
||||
<div class="admonition tip">
|
||||
<p class="admonition-title">Tip</p>
|
||||
<p>We need to choose a <code>worker_machine_type</code> with sufficient memory to run the pipeline. As the pipeline uses a mapping table, and DataFlow autoscales on CPU and not memory usage, we need a machine with more ram than usual to ensure sufficient memory when running on one worker. For <code>pp-2020.csv</code> the type <code>n1-highmem-2</code> with 2vCPU and 13GB of ram was chosen and completed successfully in ~10 minutes using only 1 worker.</p>
|
||||
</div>
|
||||
<p>Assuming the <code>pp-2020.csv</code> file has been placed in the <code>./input</code> directory in the bucket you can run a command similar to:</p>
|
||||
<div class="admonition caution">
|
||||
<p class="admonition-title">Caution</p>
|
||||
<p>Use the command <code>python -m analyse_properties.main</code> as the entrypoint to the pipeline and not <code>analyse-properties</code> as the module isn't installed with poetry on the workers with the configuration below.</p>
|
||||
</div>
|
||||
<div class="highlight"><pre><span></span><code>python -m analyse_properties.main <span class="se">\</span>
|
||||
--runner DataflowRunner <span class="se">\</span>
|
||||
--project street-group <span class="se">\</span>
|
||||
--region europe-west1 <span class="se">\</span>
|
||||
--input gs://street-group-technical-test-dmot-euw1/input/pp-2020.csv <span class="se">\</span>
|
||||
--output gs://street-group-technical-test-dmot-euw1/output/pp-2020 <span class="se">\</span>
|
||||
--temp_location gs://street-group-technical-test-dmot-euw1/tmp <span class="se">\</span>
|
||||
--subnetwork<span class="o">=</span>https://www.googleapis.com/compute/v1/projects/street-group/regions/europe-west1/subnetworks/europe-west-1-dataflow <span class="se">\</span>
|
||||
--no_use_public_ips <span class="se">\</span>
|
||||
--worker_machine_type<span class="o">=</span>n1-highmem-2
|
||||
</code></pre></div>
|
||||
<p>The output file from this pipeline is publically available and can be downloaded <a href="https://storage.googleapis.com/street-group-technical-test-dmot-euw1/output/pp-2020-00000-of-00001.json">here</a>.</p>
|
||||
<p>The job graph for this pipeline is displayed below:</p>
|
||||
<p><img alt="JobGraph" src="img/successful_dataflow_job.png" /></p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||
|
||||
|
||||
<a href="../discussion/results.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Results" rel="prev">
|
||||
<div class="md-footer__button md-icon">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||
</div>
|
||||
<div class="md-footer__title">
|
||||
<div class="md-ellipsis">
|
||||
<span class="md-footer__direction">
|
||||
Previous
|
||||
</span>
|
||||
Results
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<a href="scaling.html" class="md-footer__link md-footer__link--next" aria-label="Next: Scaling to the Full DataSet" rel="next">
|
||||
<div class="md-footer__title">
|
||||
<div class="md-ellipsis">
|
||||
<span class="md-footer__direction">
|
||||
Next
|
||||
</span>
|
||||
Scaling to the Full DataSet
|
||||
</div>
|
||||
</div>
|
||||
<div class="md-footer__button md-icon">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||
</div>
|
||||
</a>
|
||||
|
||||
</nav>
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-footer-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||
|
||||
|
||||
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||
|
||||
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
780
dataflow/scaling.html
Normal file
780
dataflow/scaling.html
Normal file
@@ -0,0 +1,780 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.2.2, mkdocs-material-7.3.0">
|
||||
|
||||
|
||||
|
||||
<title>Scaling to the Full DataSet - The Street Group Technical Test</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../assets/stylesheets/main.8b42a75e.min.css">
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../assets/stylesheets/palette.3f5d1f46.min.css">
|
||||
|
||||
|
||||
|
||||
<meta name="theme-color" content="#4051b5">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
|
||||
<style>:root{--md-text-font-family:"Roboto";--md-code-font-family:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<body dir="ltr" data-md-color-scheme="" data-md-color-primary="indigo" data-md-color-accent="blue">
|
||||
|
||||
|
||||
<script>function __prefix(e){return new URL("..",location).pathname+"."+e}function __get(e,t=localStorage){return JSON.parse(t.getItem(__prefix(e)))}</script>
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#scaling-to-the-full-dataset" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<header class="md-header" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href=".." title="The Street Group Technical Test" class="md-header__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
The Street Group Technical Test
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Scaling to the Full DataSet
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<label class="md-header__button md-icon" for="__search">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||
</label>
|
||||
|
||||
<div class="md-search" data-md-component="search" role="dialog">
|
||||
<label class="md-search__overlay" for="__search"></label>
|
||||
<div class="md-search__inner" role="search">
|
||||
<form class="md-search__form" name="search">
|
||||
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||||
<label class="md-search__icon md-icon" for="__search">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5z"/></svg>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||
</label>
|
||||
<nav class="md-search__options" aria-label="Search">
|
||||
|
||||
<button type="reset" class="md-search__icon md-icon" aria-label="Clear" tabindex="-1">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41z"/></svg>
|
||||
</button>
|
||||
</nav>
|
||||
|
||||
</form>
|
||||
<div class="md-search__output">
|
||||
<div class="md-search__scrollwrap" data-md-scrollfix>
|
||||
<div class="md-search-result" data-md-component="search-result">
|
||||
<div class="md-search-result__meta">
|
||||
Initializing search
|
||||
</div>
|
||||
<ol class="md-search-result__list"></ol>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="md-header__source">
|
||||
|
||||
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||
<div class="md-source__icon md-icon">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||
</div>
|
||||
<div class="md-source__repository">
|
||||
GitHub
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
||||
<div class="md-tabs__inner md-grid">
|
||||
<ul class="md-tabs__list">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="../index.html" class="md-tabs__link">
|
||||
Documentation
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="../discussion/introduction.html" class="md-tabs__link">
|
||||
Discussion
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="index.html" class="md-tabs__link md-tabs__link--active">
|
||||
DataFlow
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-tabs__item">
|
||||
<a href="../pandas-profiling/report.html" class="md-tabs__link">
|
||||
Data Exploration Report
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href=".." title="The Street Group Technical Test" class="md-nav__button md-logo" aria-label="The Street Group Technical Test" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54z"/></svg>
|
||||
|
||||
</a>
|
||||
The Street Group Technical Test
|
||||
</label>
|
||||
|
||||
<div class="md-nav__source">
|
||||
|
||||
<a href="https://github.com/dtomlinson91/street_group_tech_test/" title="Go to repository" class="md-source" data-md-component="source">
|
||||
<div class="md-source__icon md-icon">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
|
||||
</div>
|
||||
<div class="md-source__repository">
|
||||
GitHub
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_1" type="checkbox" id="__nav_1" >
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_1">
|
||||
Documentation
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" aria-label="Documentation" data-md-level="1">
|
||||
<label class="md-nav__title" for="__nav_1">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Documentation
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../index.html" class="md-nav__link">
|
||||
Welcome
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../documentation/installation.html" class="md-nav__link">
|
||||
Installation
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../documentation/usage.html" class="md-nav__link">
|
||||
Usage
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_2" type="checkbox" id="__nav_2" >
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2">
|
||||
Discussion
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" aria-label="Discussion" data-md-level="1">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Discussion
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/introduction.html" class="md-nav__link">
|
||||
Introduction
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/exploration.html" class="md-nav__link">
|
||||
Data Exploration Report
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/cleaning.html" class="md-nav__link">
|
||||
Cleaning
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/approach.html" class="md-nav__link">
|
||||
Approach
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../discussion/results.html" class="md-nav__link">
|
||||
Results
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="__nav_3" type="checkbox" id="__nav_3" checked>
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3">
|
||||
DataFlow
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" aria-label="DataFlow" data-md-level="1">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
DataFlow
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="index.html" class="md-nav__link">
|
||||
Running on DataFlow
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" data-md-toggle="toc" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
Scaling to the Full DataSet
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="scaling.html" class="md-nav__link md-nav__link--active">
|
||||
Scaling to the Full DataSet
|
||||
</a>
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#mapping-table" class="md-nav__link">
|
||||
Mapping table
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#patterns" class="md-nav__link">
|
||||
Patterns
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#solution" class="md-nav__link">
|
||||
Solution
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../pandas-profiling/report.html" class="md-nav__link">
|
||||
Data Exploration Report
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#mapping-table" class="md-nav__link">
|
||||
Mapping table
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#patterns" class="md-nav__link">
|
||||
Patterns
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#solution" class="md-nav__link">
|
||||
Solution
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
<a href="https://github.com/dtomlinson91/street_group_tech_test/edit/master/docs/dataflow/scaling.md" title="Edit this page" class="md-content__button md-icon">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z"/></svg>
|
||||
</a>
|
||||
|
||||
|
||||
<h1 id="scaling-to-the-full-dataset">Scaling to the full DataSet<a class="headerlink" href="#scaling-to-the-full-dataset" title="Permanent link">¶</a></h1>
|
||||
<p>As is the pipeline will not run against the full dataset. But with a little work done to the existing pipeline I believe it is possible to work against the full dataset of ~27 million rows.</p>
|
||||
<h2 id="mapping-table">Mapping table<a class="headerlink" href="#mapping-table" title="Permanent link">¶</a></h2>
|
||||
<p>Using a mapping table as a side-input means that for the full dataset this table is going to be huge.</p>
|
||||
<p>Side inputs are stored in memory on the workers, with such a huge table the machines are going to quickly run out of available memory when autoscaling is applied.</p>
|
||||
<p>Running the pipeline against the full dataset resulted in the following error:</p>
|
||||
<div class="highlight"><pre><span></span><code>"Out of memory: Killed process 2042 (python) total-vm:28616496kB, anon-rss:25684136kB, file-rss:0kB, shmem-rss:0kB, UID:0 pgtables:51284kB oom_score_adj:900"
|
||||
</code></pre></div>
|
||||
<p>with the pipeline job failing to process anything and the rows being processed per/sec gradually falling to zero as the workers killed the Python process to try free up more memory. This resulted in autoscaling down (as the CPU decreased) and the entire pipeline stagnated.</p>
|
||||
<p>Using a higher tiered <code>worker_machine_type</code>, disabling autoscaling, and fixing the workers to the maximum number of vCPUs available to the quota results in pipeline options:</p>
|
||||
<div class="highlight"><pre><span></span><code>--worker_machine_type<span class="o">=</span>n1-highmem-8 <span class="se">\</span>
|
||||
--num_workers<span class="o">=</span><span class="m">3</span> <span class="se">\</span>
|
||||
--autoscaling_algorithm<span class="o">=</span>NONE
|
||||
</code></pre></div>
|
||||
<p>with 156GB of RAM available to the pipeline with 52GB on each worker.</p>
|
||||
<p>The pipeline was able to progress further until Python threw an error and the pipeline failed and shut down:</p>
|
||||
<div class="highlight"><pre><span></span><code>"Error message from worker: Traceback (most recent call last):
|
||||
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 651, in do_work
|
||||
work_executor.execute()
|
||||
...
|
||||
File "/usr/local/lib/python3.7/multiprocessing/connection.py", line 393, in _send_bytes
|
||||
header = struct.pack("!i", n)
|
||||
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
|
||||
</code></pre></div>
|
||||
<p>The number 2147483647 being the maximum value for a 32bit integer.</p>
|
||||
<p>As the side-input needs to be pickled (or serialised), this tells us that the table is far too large to be pickled and passed to the other workers. No amount of CPU/Memory can fix the problem.</p>
|
||||
<h2 id="patterns">Patterns<a class="headerlink" href="#patterns" title="Permanent link">¶</a></h2>
|
||||
<p>Google have several patterns for large side-inputs which are documented here:</p>
|
||||
<ul>
|
||||
<li>Part 1 <a href="https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-1">https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-1</a></li>
|
||||
<li>Part 2 <a href="https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-2">https://cloud.google.com/blog/products/data-analytics/guide-to-common-cloud-dataflow-use-case-patterns-part-2</a></li>
|
||||
</ul>
|
||||
<h2 id="solution">Solution<a class="headerlink" href="#solution" title="Permanent link">¶</a></h2>
|
||||
<p>A possible solution would be to leverage BigQuery to store the results of the mapping table in as the pipeline progresses. We can make use of BigQueries array type to literally store the raw array as we process each row.</p>
|
||||
<p>In addition to creating the mapping table <code>(key, value)</code> pairs, we also save these pairs to BigQuery at this stage. We then yield the element as it is currently written to allow the subsequent stages to make use of this data.</p>
|
||||
<p>Remove the condense mapping table stage as it is no longer needed.</p>
|
||||
<p>Instead of using:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="n">beam</span><span class="o">.</span><span class="n">FlatMap</span><span class="p">(</span>
|
||||
<span class="n">insert_data_for_id</span><span class="p">,</span> <span class="n">beam</span><span class="o">.</span><span class="n">pvalue</span><span class="o">.</span><span class="n">AsSingleton</span><span class="p">(</span><span class="n">mapping_table_condensed</span><span class="p">)</span>
|
||||
<span class="p">)</span>
|
||||
</code></pre></div>
|
||||
<p>to insert the results of the mapping table we write a new <code>DoFn</code> that takes the element, and for each <code>id_all_columns</code> in the array we make a call to BigQuery to get the array for this ID and insert it at this stage.</p>
|
||||
<p>Because each <code>id_all_columns</code> and its corresponding data is only used once, there would be no need to cache the results from BigQuery, however some work could be done to see if we could pull back more than one row at a time and cache these, saving time/costs in calls to BigQuery.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<nav class="md-footer__inner md-grid" aria-label="Footer">
|
||||
|
||||
|
||||
<a href="index.html" class="md-footer__link md-footer__link--prev" aria-label="Previous: Running on DataFlow" rel="prev">
|
||||
<div class="md-footer__button md-icon">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12z"/></svg>
|
||||
</div>
|
||||
<div class="md-footer__title">
|
||||
<div class="md-ellipsis">
|
||||
<span class="md-footer__direction">
|
||||
Previous
|
||||
</span>
|
||||
Running on DataFlow
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<a href="../pandas-profiling/report.html" class="md-footer__link md-footer__link--next" aria-label="Next: Data Exploration Report" rel="next">
|
||||
<div class="md-footer__title">
|
||||
<div class="md-ellipsis">
|
||||
<span class="md-footer__direction">
|
||||
Next
|
||||
</span>
|
||||
Data Exploration Report
|
||||
</div>
|
||||
</div>
|
||||
<div class="md-footer__button md-icon">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11H4z"/></svg>
|
||||
</div>
|
||||
</a>
|
||||
|
||||
</nav>
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-footer-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
<script id="__config" type="application/json">{"base": "..", "features": {"navigation.tabs": true}, "translations": {"clipboard.copy": "Copy to clipboard", "clipboard.copied": "Copied to clipboard", "search.config.lang": "en", "search.config.pipeline": "trimmer, stopWordFilter", "search.config.separator": "[\\s\\-]+", "search.placeholder": "Search", "search.result.placeholder": "Type to start searching", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.term.missing": "Missing", "select.version.title": "Select version"}, "search": "../assets/javascripts/workers/search.f8263e09.min.js", "version": null}</script>
|
||||
|
||||
|
||||
<script src="../assets/javascripts/bundle.4fc53ad4.min.js"></script>
|
||||
|
||||
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user