Compare commits
225 Commits
LO_Module1...main
| SHA1 |
|---|
| a6045b3ddb | |||
| e6fd4c16d2 | |||
| ded05d789c | |||
| c0b7d74647 | |||
| a65e6de49c | |||
| 11d5096f6e | |||
| 4fbe5ebe43 | |||
| 020af9c5fa | |||
| 196d307de7 | |||
| 8a3cc88f5e | |||
| f66f0ff62b | |||
| 092e24eb6c | |||
| 7e13f11a2d | |||
| 1e7c141cb7 | |||
| fe4500b0ca | |||
| e830b57a4c | |||
| 0281175f7c | |||
| f24e9dc1d8 | |||
| 8daeb91a92 | |||
| 1fb6720a93 | |||
| 3d41139216 | |||
| d5a4567208 | |||
| 1739740eba | |||
| cee8c30227 | |||
| 8a09faf5a5 | |||
| 59254b4f66 | |||
| 93a5f99aea | |||
| beb77c92b9 | |||
| 5a6cedd21b | |||
| cd322fb154 | |||
| 380eafa8d1 | |||
| 6aa4a58420 | |||
| f64aa8339b | |||
| 568f47eccd | |||
| 57edfa075f | |||
| 6f35486ac6 | |||
| 7fb9aa7c5b | |||
| f0ad0f2c75 | |||
| 3f062330c7 | |||
| 37e37b3c9c | |||
| be1fe3f071 | |||
| 8f67b3358d | |||
| 20f14e5a5c | |||
| 191d9fe23d | |||
| eea47ecfe5 | |||
| 4345a33d7f | |||
| 5f27b7ceb5 | |||
| b8da62cb88 | |||
| c1d6fde336 | |||
| 3ad1730500 | |||
| 6bdf7883a5 | |||
| 6b2e40d70a | |||
| ed96de4b49 | |||
| 4746212309 | |||
| 8c0bb0b43e | |||
| 1ad93cd381 | |||
| 2096b9e2a1 | |||
| beeb9e6454 | |||
| af646fa483 | |||
| 71d5e47ea0 | |||
| 744316473e | |||
| 6c045a2fa7 | |||
| ef377950c0 | |||
| 2990b7f14e | |||
| 44fc08d7db | |||
| 7caa2ff237 | |||
| 5801672ec8 | |||
| 4877ceb245 | |||
| 77340b1d79 | |||
| 177b1a8c18 | |||
| 5b71053758 | |||
| 9f8d5d12fe | |||
| ae86a2001d | |||
| 9d62b2cc61 | |||
| ab39fc3bcc | |||
| 5873a63ce9 | |||
| 89ea5e8bac | |||
| 0ac417886c | |||
| 35d50cec77 | |||
| be40774fdd | |||
| 1b516814d8 | |||
| eee41d9457 | |||
| eea2214132 | |||
| e9b3a17b9c | |||
| b94ab37921 | |||
| ae09f9b79d | |||
| f940e69e52 | |||
| a7caea6294 | |||
| 889b748f27 | |||
| 22134a14f1 | |||
| ee48f1d3f8 | |||
| 884f9f0350 | |||
| fe849fdf5c | |||
| e719405956 | |||
| 1ca12378ff | |||
| 624efa10ab | |||
| da36243d1c | |||
| ddc22c29ab | |||
| 19be2ed8f4 | |||
| db7f42d882 | |||
| 98f6a4df08 | |||
| 762b0ce4b9 | |||
| c9fae602b4 | |||
| 51d4241650 | |||
| 1dd47ba96c | |||
| a7393a4063 | |||
| 45991f4254 | |||
| b7a5d61406 | |||
| afdf9508e6 | |||
| b44834ff60 | |||
| c5a06cf150 | |||
| 770197cbe3 | |||
| cb874911ba | |||
| 782acf26ce | |||
| 1c7926a713 | |||
| 68f0e6cb53 | |||
| b17729fa9a | |||
| 7de55821ee | |||
| 8a56888246 | |||
| c3e5ef4518 | |||
| f31e2fe93a | |||
| 36c29eaf1b | |||
| 2ab335505c | |||
| 3fabb1cfda | |||
| baa2ea4cf7 | |||
| 4553062578 | |||
| d3dabf2b81 | |||
| 46e15f69e7 | |||
| d2e59f2350 | |||
| da6a842ee7 | |||
| d763f07395 | |||
| 427d17d012 | |||
| 51a9c95b7d | |||
| 6a2b86d8af | |||
| e659ff26b8 | |||
| 6bc22c63cf | |||
| 0f9b564bce | |||
| fe4419866d | |||
| 53b2676115 | |||
| c0c772b8ce | |||
| 4117ce9f5d | |||
| b1ad88253c | |||
| 049dd34c6c | |||
| 1efd2a236c | |||
| 72c4c821dc | |||
| 68e8e1a9cb | |||
| 261b50d042 | |||
| b269844ea3 | |||
| 35b99817dc | |||
| 78a5940578 | |||
| 13a7752e5e | |||
| 3af1021228 | |||
| f641f94a25 | |||
| 0563fb5ff7 | |||
| a64e90ac36 | |||
| e69c289b40 | |||
| 69bc9aec1b | |||
| fe176c1679 | |||
| d9cb16e282 | |||
| 6d2f1aa7e8 | |||
| 390b2f6994 | |||
| ef6791e1cf | |||
| 865849b0ef | |||
| 9249bfba29 | |||
| bb43aa52e4 | |||
| 9a6d7878fd | |||
| fe0b744ffe | |||
| dbe68cd993 | |||
| a00f31fb85 | |||
| 9882dd7411 | |||
| f46e0044b9 | |||
| 38087a646d | |||
| 4617e63ddd | |||
| 738c22f91b | |||
| d576cfb1c9 | |||
| af248385c0 | |||
| 7abbbde00e | |||
| dd84d736bc | |||
| 6ae0b18eea | |||
| e9c8748e29 | |||
| a6fda6d5ca | |||
| ee88d7f230 | |||
| 7a251b614b | |||
| b6901c05bf | |||
| 9e89d9849e | |||
| 2a59822b4a | |||
| f8221f25be | |||
| 9c219f7fdc | |||
| 5703a49efd | |||
| 7e2c7f94c4 | |||
| 20671b4b48 | |||
| 1d7f51ffaf | |||
| 43b2104fa9 | |||
| b11c9cb1e3 | |||
| ee0546ba0a | |||
| 1decc32b8d | |||
| 178fe94ed8 | |||
| a5e008b498 | |||
| ebcb10c8ab | |||
| cb55908a7c | |||
| 34a63cff05 | |||
| 3e247158a4 | |||
| 11c60f66c7 | |||
| 594faf0f32 | |||
| 2bb25463ea | |||
| bbe191aecc | |||
| fa39a9d342 | |||
| e4cb817399 | |||
| 5259facfb4 | |||
| 130a508a65 | |||
| dce01a2794 | |||
| 142b9f4ee4 | |||
| d18ceb6044 | |||
| 0e0aae68b4 | |||
| 468aacb1ef | |||
| 860833525a | |||
| 2418faf718 | |||
| 325131f959 | |||
| 8c455873fd | |||
| be68361c40 | |||
| bfef9aa2fb | |||
| 9847430ca7 | |||
| 960fed9828 | |||
| 3f5cefcdd7 | |||
| 57c7ce33f8 |
@@ -1,93 +0,0 @@
|
||||
# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go/.devcontainer/base.Dockerfile
|
||||
|
||||
# [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1-buster, 1.16-buster, 1.17-buster
|
||||
ARG VARIANT=1-bullseye
|
||||
FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT}
|
||||
|
||||
# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
|
||||
ARG NODE_VERSION="none"
|
||||
RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi
|
||||
|
||||
# Install powershell
|
||||
ARG PS_VERSION="7.2.1"
|
||||
# powershell-7.3.0-linux-x64.tar.gz
|
||||
# powershell-7.3.0-linux-arm64.tar.gz
|
||||
RUN ARCH="$(dpkg --print-architecture)"; \
|
||||
if [ "${ARCH}" = "amd64" ]; then \
|
||||
PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-x64.tar.gz"; \
|
||||
elif [ "${ARCH}" = "arm64" ]; then \
|
||||
PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm64.tar.gz"; \
|
||||
elif [ "${ARCH}" = "armhf" ]; then \
|
||||
PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm32.tar.gz"; \
|
||||
fi; \
|
||||
wget https://github.com/PowerShell/PowerShell/releases/download/$PS_BIN -O pwsh.tar.gz; \
|
||||
mkdir /usr/local/pwsh && \
|
||||
tar Cxvfz /usr/local/pwsh pwsh.tar.gz && \
|
||||
rm pwsh.tar.gz
|
||||
|
||||
ENV PATH=$PATH:/usr/local/pwsh
|
||||
|
||||
RUN echo 'deb http://download.opensuse.org/repositories/shells:/fish:/release:/3/Debian_11/ /' | tee /etc/apt/sources.list.d/shells:fish:release:3.list; \
|
||||
curl -fsSL https://download.opensuse.org/repositories/shells:fish:release:3/Debian_11/Release.key | gpg --dearmor | tee /etc/apt/trusted.gpg.d/shells_fish_release_3.gpg > /dev/null; \
|
||||
apt-get update && export DEBIAN_FRONTEND=noninteractive \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
fish \
|
||||
tmux \
|
||||
fzf \
|
||||
&& apt-get clean
|
||||
|
||||
ARG USERNAME=vscode
|
||||
|
||||
# Download the oh-my-posh binary
|
||||
RUN mkdir /home/${USERNAME}/bin; \
|
||||
wget https://github.com/JanDeDobbeleer/oh-my-posh/releases/latest/download/posh-linux-$(dpkg --print-architecture) -O /home/${USERNAME}/bin/oh-my-posh; \
|
||||
chmod +x /home/${USERNAME}/bin/oh-my-posh; \
|
||||
chown ${USERNAME}: /home/${USERNAME}/bin;
|
||||
|
||||
# NOTE: devcontainers are Linux-only at this time but when
|
||||
# Windows or Darwin is supported someone will need to improve
|
||||
# the code logic above.
|
||||
|
||||
# Setup a neat little PowerShell experience
|
||||
RUN pwsh -Command Install-Module posh-git -Scope AllUsers -Force; \
|
||||
pwsh -Command Install-Module z -Scope AllUsers -Force; \
|
||||
pwsh -Command Install-Module PSFzf -Scope AllUsers -Force; \
|
||||
pwsh -Command Install-Module Terminal-Icons -Scope AllUsers -Force;
|
||||
|
||||
# add the oh-my-posh path to the PATH variable
|
||||
ENV PATH "$PATH:/home/${USERNAME}/bin"
|
||||
|
||||
# Can be used to override the devcontainer prompt default theme:
|
||||
ENV POSH_THEME="https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json"
|
||||
|
||||
# Deploy oh-my-posh prompt to Powershell:
|
||||
COPY Microsoft.PowerShell_profile.ps1 /home/${USERNAME}/.config/powershell/Microsoft.PowerShell_profile.ps1
|
||||
|
||||
# Deploy oh-my-posh prompt to Fish:
|
||||
COPY config.fish /home/${USERNAME}/.config/fish/config.fish
|
||||
|
||||
# Everything runs as root during build time, so we want
|
||||
# to make sure the vscode user can edit these paths too:
|
||||
RUN chmod 777 -R /home/${USERNAME}/.config
|
||||
|
||||
# Override vscode's own Bash prompt with oh-my-posh:
|
||||
RUN sed -i 's/^__bash_prompt$/#&/' /home/${USERNAME}/.bashrc && \
|
||||
echo "eval \"\$(oh-my-posh init bash --config $POSH_THEME)\"" >> /home/${USERNAME}/.bashrc
|
||||
|
||||
# Override vscode's own ZSH prompt with oh-my-posh:
|
||||
RUN echo "eval \"\$(oh-my-posh init zsh --config $POSH_THEME)\"" >> /home/${USERNAME}/.zshrc
|
||||
|
||||
# Set container timezone:
|
||||
ARG TZ="UTC"
|
||||
RUN ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime
|
||||
|
||||
# Required for Python - Confluent Kafka on M1 Silicon
|
||||
RUN apt update && apt -y install software-properties-common gcc
|
||||
RUN git clone https://github.com/edenhill/librdkafka
|
||||
RUN cd librdkafka && ./configure && make && make install && ldconfig
|
||||
|
||||
# [Optional] Uncomment the next line to use go get to install anything else you need
|
||||
# RUN go get -x github.com/JanDeDobbeleer/battery
|
||||
|
||||
# [Optional] Uncomment this line to install global node packages.
|
||||
# RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g <your-package-here>" 2>&1
|
||||
@@ -1,14 +0,0 @@
Import-Module posh-git
Import-Module PSFzf -ArgumentList 'Ctrl+t', 'Ctrl+r'
Import-Module z
Import-Module Terminal-Icons

Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete

$env:POSH_GIT_ENABLED=$true
oh-my-posh init pwsh --config $env:POSH_THEME | Invoke-Expression

# NOTE: You can override the above env var from the devcontainer.json "args" under the "build" key.

# Aliases
Set-Alias -Name ac -Value Add-Content
@@ -1,58 +0,0 @@
|
||||
# Devcontainer for DataTalksClub Data Engineering Zoomcamp
|
||||
This devcontainer sets up a development environment for this class. It can be used with both VS Code and GitHub Codespaces.
|
||||
|
||||
## Getting Started
|
||||
To continue, make sure you have [Visual Studio Code](https://code.visualstudio.com/) and [Docker Desktop](https://www.docker.com/products/docker-desktop/) installed OR use [GitHub Codespaces](https://github.com/features/codespaces).
|
||||
|
||||
**Option 1: Local VS Code**
|
||||
|
||||
1. Clone the repo and connect to it in VS Code:
|
||||
|
||||
```bash
|
||||
$ cd your/desired/repo/location
|
||||
$ git clone https://github.com/DataTalksClub/data-engineering-zoomcamp.git
|
||||
```
|
||||
|
||||
2. Download the [`Dev Containers`](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension from the VS Code marketplace. Full docs on devcontainers [here](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
|
||||
|
||||
3. Press Cmd + Shift + P (Mac) or Ctrl + Shift + P (Windows) to open the Command Palette. Type in `Dev Containers: Open Folder in Container` and select the repo directory
|
||||
|
||||
4. Wait for the container to build and the dependencies to install
|
||||
|
||||
**Option 2: GitHub Codespaces**
|
||||
|
||||
1. Fork this repo
|
||||
|
||||
2. From the repo page in GitHub, select the green `<> Code` button and choose Codespaces
|
||||
|
||||
3. Click `Create Codespace on Main`, or check out a branch if you prefer
|
||||
|
||||
4. Wait for the container to build and the dependencies to install
|
||||
|
||||
5. Start developing!
|
||||
|
||||
|
||||
## Included Tools and Languages:
|
||||
|
||||
* `Python 3.9`
|
||||
- `Pandas`
|
||||
- `SQLAlchemy`
|
||||
- `PySpark`
|
||||
- `PyArrow`
|
||||
- `Polars`
|
||||
- `Prefect 2.7.7` and all required Python dependencies
|
||||
- `confluent-kafka`
|
||||
* `Google Cloud SDK`
|
||||
* `dbt-core`
|
||||
- `dbt-postgres`
|
||||
- `dbt-bigquery`
|
||||
* `Terraform`
|
||||
* `Jupyter Notebooks for VS Code`
|
||||
* `Docker`
|
||||
* `Spark`
|
||||
* `JDK` version 11
|
||||
* [`Oh-My-Posh Powershell themes`](https://github.com/JanDeDobbeleer/oh-my-posh)
|
||||
* Popular VS Code themes (GitHub, Atom One, Material Icons etc.)
|
||||
|
||||
## Customization
|
||||
Feel free to modify the `Dockerfile`, `devcontainer.json` or `requirements.txt` files to include any other tools or packages that you need for your development environment. In the Dockerfile, you can customize the `POSH_THEME` environment variable with a theme of your choosing from [here](https://ohmyposh.dev/docs/themes).
|
||||
@@ -1,4 +0,0 @@
# Activate oh-my-posh prompt:
oh-my-posh init fish --config $POSH_THEME | source

# NOTE: You can override the above env vars from the devcontainer.json "args" under the "build" key.
@@ -1,117 +0,0 @@
|
||||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
|
||||
// https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go
|
||||
{
|
||||
"name": "oh-my-posh",
|
||||
"build": {
|
||||
"dockerfile": "Dockerfile",
|
||||
"args": {
|
||||
// Update the VARIANT arg to pick a version of Go: 1, 1.16, 1.17
|
||||
// Append -bullseye or -buster to pin to an OS version.
|
||||
// Use -bullseye variants on local arm64/Apple Silicon.
|
||||
"VARIANT": "1.19-bullseye",
|
||||
// Options:
|
||||
|
||||
"POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json",
|
||||
|
||||
// Override me with your own timezone:
|
||||
"TZ": "America/Moncton",
|
||||
// Use one of the "TZ database name" entries from:
|
||||
// https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
||||
|
||||
"NODE_VERSION": "lts/*",
|
||||
//Powershell version
|
||||
"PS_VERSION": "7.2.7"
|
||||
}
|
||||
},
|
||||
"runArgs": ["--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined"],
|
||||
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/azure-cli:1": {
|
||||
"version": "latest"
|
||||
},
|
||||
"ghcr.io/devcontainers/features/python:1": {
|
||||
"version": "3.9"
|
||||
},
|
||||
"ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {},
|
||||
"ghcr.io/devcontainers-contrib/features/terraform-asdf:2": {},
|
||||
"ghcr.io/devcontainers-contrib/features/yamllint:2": {},
|
||||
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
|
||||
"ghcr.io/devcontainers/features/docker-outside-of-docker:1": {},
|
||||
"ghcr.io/devcontainers/features/github-cli:1": {},
|
||||
"ghcr.io/devcontainers-contrib/features/spark-sdkman:2": {
|
||||
"jdkVersion": "11"
|
||||
},
|
||||
"ghcr.io/dhoeric/features/google-cloud-cli:1": {
|
||||
"version": "latest"
|
||||
}
|
||||
},
|
||||
|
||||
// Set *default* container specific settings.json values on container create.
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"settings": {
|
||||
"go.toolsManagement.checkForUpdates": "local",
|
||||
"go.useLanguageServer": true,
|
||||
"go.gopath": "/go",
|
||||
"go.goroot": "/usr/local/go",
|
||||
"terminal.integrated.profiles.linux": {
|
||||
"bash": {
|
||||
"path": "bash"
|
||||
},
|
||||
"zsh": {
|
||||
"path": "zsh"
|
||||
},
|
||||
"fish": {
|
||||
"path": "fish"
|
||||
},
|
||||
"tmux": {
|
||||
"path": "tmux",
|
||||
"icon": "terminal-tmux"
|
||||
},
|
||||
"pwsh": {
|
||||
"path": "pwsh",
|
||||
"icon": "terminal-powershell"
|
||||
}
|
||||
},
|
||||
"terminal.integrated.defaultProfile.linux": "pwsh",
|
||||
"terminal.integrated.defaultProfile.windows": "pwsh",
|
||||
"terminal.integrated.defaultProfile.osx": "pwsh",
|
||||
"tasks.statusbar.default.hide": true,
|
||||
"terminal.integrated.tabs.defaultIcon": "terminal-powershell",
|
||||
"terminal.integrated.tabs.defaultColor": "terminal.ansiBlue",
|
||||
"workbench.colorTheme": "GitHub Dark Dimmed",
|
||||
"workbench.iconTheme": "material-icon-theme"
|
||||
},
|
||||
|
||||
// Add the IDs of extensions you want installed when the container is created.
|
||||
"extensions": [
|
||||
"actboy168.tasks",
|
||||
"eamodio.gitlens",
|
||||
"davidanson.vscode-markdownlint",
|
||||
"editorconfig.editorconfig",
|
||||
"esbenp.prettier-vscode",
|
||||
"github.vscode-pull-request-github",
|
||||
"golang.go",
|
||||
"ms-vscode.powershell",
|
||||
"redhat.vscode-yaml",
|
||||
"yzhang.markdown-all-in-one",
|
||||
"ms-python.python",
|
||||
"ms-python.vscode-pylance",
|
||||
"ms-toolsai.jupyter",
|
||||
"akamud.vscode-theme-onedark",
|
||||
"ms-vscode-remote.remote-containers",
|
||||
"PKief.material-icon-theme",
|
||||
"GitHub.github-vscode-theme"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
// "forwardPorts": [3000],
|
||||
|
||||
// Use 'postCreateCommand' to run commands after the container is created.
|
||||
"postCreateCommand": "pip3 install --user -r .devcontainer/requirements.txt --use-pep517",
|
||||
|
||||
// Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
|
||||
"remoteUser": "vscode"
|
||||
}
|
||||
@@ -1,16 +0,0 @@
pandas==1.5.2
prefect==2.7.7
prefect-sqlalchemy==0.2.2
prefect-gcp[cloud_storage]==0.2.4
protobuf
pyarrow==10.0.1
pandas-gbq==0.18.1
psycopg2-binary==2.9.5
sqlalchemy==1.4.46
ipykernel
polars
dbt-core
dbt-bigquery
dbt-postgres
pyspark
confluent-kafka==1.9.2
@@ -113,6 +113,10 @@ $ aws s3 ls s3://nyc-tlc
|
||||
PRE trip data/
|
||||
```
|
||||
|
||||
You can refer to `data-loading-parquet.ipynb` and `data-loading-parquet.py` for code that handles both CSV and parquet files. (The lookup zones table, which is needed later in this course, is a CSV file.)
|
||||
> Note: You will need to install the `pyarrow` library (add it to your Dockerfile).
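
The two files linked above are the reference implementation. As a quick orientation, here is a minimal sketch of the batched parquet-reading idea they use, assuming you have already installed `pyarrow` and downloaded a trip-data file such as `yellow_tripdata_2023-09.parquet` (the file name is only an example):

```python
import pyarrow.parquet as pq

# Open the parquet file lazily; nothing is read into memory yet
parquet_file = pq.ParquetFile("yellow_tripdata_2023-09.parquet")  # example file name

# Inspect the metadata (row count, column count) and schema before ingesting
print(parquet_file.metadata)
print(parquet_file.schema_arrow)

# Stream the file in chunks of 100,000 rows instead of loading it all at once
for batch in parquet_file.iter_batches(batch_size=100_000):
    df = batch.to_pandas()  # each batch becomes a regular pandas DataFrame
    print(f"got a batch with {len(df)} rows")
```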
|
||||
|
||||
|
||||
### pgAdmin
|
||||
|
||||
Running pgAdmin
|
||||
938
01-docker-terraform/2_docker_sql/data-loading-parquet.ipynb
Normal file
@@ -0,0 +1,938 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "52bad16a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data loading \n",
|
||||
"\n",
|
||||
"Here we will be using the ```.paraquet``` file we downloaded and do the following:\n",
|
||||
" - Check metadata and table datatypes of the paraquet file/table\n",
|
||||
" - Convert the paraquet file to pandas dataframe and check the datatypes. Additionally check the data dictionary to make sure you have the right datatypes in pandas, as pandas will automatically create the table in our database.\n",
|
||||
" - Generate the DDL CREATE statement from pandas for a sanity check.\n",
|
||||
" - Create a connection to our database using SQLAlchemy\n",
|
||||
" - Convert our huge paraquet file into a iterable that has batches of 100,000 rows and load it into our database."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "afef2456",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T23:55:14.141738Z",
|
||||
"start_time": "2023-12-03T23:55:14.124217Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import pyarrow.parquet as pq\n",
|
||||
"from time import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c750d1d4",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T02:54:01.925350Z",
|
||||
"start_time": "2023-12-03T02:54:01.661119Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<pyarrow._parquet.FileMetaData object at 0x7fed89ffa540>\n",
|
||||
" created_by: parquet-cpp-arrow version 13.0.0\n",
|
||||
" num_columns: 19\n",
|
||||
" num_rows: 2846722\n",
|
||||
" num_row_groups: 3\n",
|
||||
" format_version: 2.6\n",
|
||||
" serialized_size: 6357"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Read metadata \n",
|
||||
"pq.read_metadata('yellow_tripdata_2023-09.parquet')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a970fcf0",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T23:28:08.411945Z",
|
||||
"start_time": "2023-12-03T23:28:08.177693Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"VendorID: int32\n",
|
||||
"tpep_pickup_datetime: timestamp[us]\n",
|
||||
"tpep_dropoff_datetime: timestamp[us]\n",
|
||||
"passenger_count: int64\n",
|
||||
"trip_distance: double\n",
|
||||
"RatecodeID: int64\n",
|
||||
"store_and_fwd_flag: large_string\n",
|
||||
"PULocationID: int32\n",
|
||||
"DOLocationID: int32\n",
|
||||
"payment_type: int64\n",
|
||||
"fare_amount: double\n",
|
||||
"extra: double\n",
|
||||
"mta_tax: double\n",
|
||||
"tip_amount: double\n",
|
||||
"tolls_amount: double\n",
|
||||
"improvement_surcharge: double\n",
|
||||
"total_amount: double\n",
|
||||
"congestion_surcharge: double\n",
|
||||
"Airport_fee: double"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Read file, read the table from file and check schema\n",
|
||||
"file = pq.ParquetFile('yellow_tripdata_2023-09.parquet')\n",
|
||||
"table = file.read()\n",
|
||||
"table.schema"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "43f6ea7e",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T23:28:22.870376Z",
|
||||
"start_time": "2023-12-03T23:28:22.563414Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 2846722 entries, 0 to 2846721\n",
|
||||
"Data columns (total 19 columns):\n",
|
||||
" # Column Dtype \n",
|
||||
"--- ------ ----- \n",
|
||||
" 0 VendorID int32 \n",
|
||||
" 1 tpep_pickup_datetime datetime64[ns]\n",
|
||||
" 2 tpep_dropoff_datetime datetime64[ns]\n",
|
||||
" 3 passenger_count float64 \n",
|
||||
" 4 trip_distance float64 \n",
|
||||
" 5 RatecodeID float64 \n",
|
||||
" 6 store_and_fwd_flag object \n",
|
||||
" 7 PULocationID int32 \n",
|
||||
" 8 DOLocationID int32 \n",
|
||||
" 9 payment_type int64 \n",
|
||||
" 10 fare_amount float64 \n",
|
||||
" 11 extra float64 \n",
|
||||
" 12 mta_tax float64 \n",
|
||||
" 13 tip_amount float64 \n",
|
||||
" 14 tolls_amount float64 \n",
|
||||
" 15 improvement_surcharge float64 \n",
|
||||
" 16 total_amount float64 \n",
|
||||
" 17 congestion_surcharge float64 \n",
|
||||
" 18 Airport_fee float64 \n",
|
||||
"dtypes: datetime64[ns](2), float64(12), int32(3), int64(1), object(1)\n",
|
||||
"memory usage: 380.1+ MB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Convert to pandas and check data \n",
|
||||
"df = table.to_pandas()\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ccf039a0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We need to first create the connection to our postgres database. We can feed the connection information to generate the CREATE SQL query for the specific server. SQLAlchemy supports a variety of servers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "44e701ae",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T22:50:25.811951Z",
|
||||
"start_time": "2023-12-03T22:50:25.393987Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<sqlalchemy.engine.base.Connection at 0x7fed98ea3190>"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Create an open SQL database connection object or a SQLAlchemy connectable\n",
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"\n",
|
||||
"engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')\n",
|
||||
"engine.connect()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c96a1075",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T22:50:43.628727Z",
|
||||
"start_time": "2023-12-03T22:50:43.442337Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"CREATE TABLE yellow_taxi_data (\n",
|
||||
"\t\"VendorID\" INTEGER, \n",
|
||||
"\ttpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, \n",
|
||||
"\ttpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, \n",
|
||||
"\tpassenger_count FLOAT(53), \n",
|
||||
"\ttrip_distance FLOAT(53), \n",
|
||||
"\t\"RatecodeID\" FLOAT(53), \n",
|
||||
"\tstore_and_fwd_flag TEXT, \n",
|
||||
"\t\"PULocationID\" INTEGER, \n",
|
||||
"\t\"DOLocationID\" INTEGER, \n",
|
||||
"\tpayment_type BIGINT, \n",
|
||||
"\tfare_amount FLOAT(53), \n",
|
||||
"\textra FLOAT(53), \n",
|
||||
"\tmta_tax FLOAT(53), \n",
|
||||
"\ttip_amount FLOAT(53), \n",
|
||||
"\ttolls_amount FLOAT(53), \n",
|
||||
"\timprovement_surcharge FLOAT(53), \n",
|
||||
"\ttotal_amount FLOAT(53), \n",
|
||||
"\tcongestion_surcharge FLOAT(53), \n",
|
||||
"\t\"Airport_fee\" FLOAT(53)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Generate CREATE SQL statement from schema for validation\n",
|
||||
"print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eca7f32d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Datatypes for the table looks good! Since we used paraquet file the datasets seem to have been preserved. You may have to convert some datatypes so it is always good to do this check."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "51a751ed",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Finally inserting data\n",
|
||||
"\n",
|
||||
"There are 2,846,722 rows in our dataset. We are going to use the ```parquet_file.iter_batches()``` function to create batches of 100,000, convert them into pandas and then load it into the postgres database."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e20cec73",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-03T23:49:28.768786Z",
|
||||
"start_time": "2023-12-03T23:49:28.689732Z"
|
||||
},
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>VendorID</th>\n",
|
||||
" <th>tpep_pickup_datetime</th>\n",
|
||||
" <th>tpep_dropoff_datetime</th>\n",
|
||||
" <th>passenger_count</th>\n",
|
||||
" <th>trip_distance</th>\n",
|
||||
" <th>RatecodeID</th>\n",
|
||||
" <th>store_and_fwd_flag</th>\n",
|
||||
" <th>PULocationID</th>\n",
|
||||
" <th>DOLocationID</th>\n",
|
||||
" <th>payment_type</th>\n",
|
||||
" <th>fare_amount</th>\n",
|
||||
" <th>extra</th>\n",
|
||||
" <th>mta_tax</th>\n",
|
||||
" <th>tip_amount</th>\n",
|
||||
" <th>tolls_amount</th>\n",
|
||||
" <th>improvement_surcharge</th>\n",
|
||||
" <th>total_amount</th>\n",
|
||||
" <th>congestion_surcharge</th>\n",
|
||||
" <th>Airport_fee</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>2023-09-01 00:15:37</td>\n",
|
||||
" <td>2023-09-01 00:20:21</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.80</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>163</td>\n",
|
||||
" <td>230</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>6.5</td>\n",
|
||||
" <td>3.5</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>11.50</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-01 00:18:40</td>\n",
|
||||
" <td>2023-09-01 00:30:28</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2.34</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>236</td>\n",
|
||||
" <td>233</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>14.2</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>2.00</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>21.20</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-01 00:35:01</td>\n",
|
||||
" <td>2023-09-01 00:39:04</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.62</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>162</td>\n",
|
||||
" <td>236</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>8.6</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>2.00</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>15.60</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-01 00:45:45</td>\n",
|
||||
" <td>2023-09-01 00:47:37</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.74</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>141</td>\n",
|
||||
" <td>229</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>5.1</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>1.00</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>11.10</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-01 00:01:23</td>\n",
|
||||
" <td>2023-09-01 00:38:05</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>9.85</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>138</td>\n",
|
||||
" <td>230</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>45.0</td>\n",
|
||||
" <td>6.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>17.02</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>73.77</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>1.75</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>99995</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-02 09:55:17</td>\n",
|
||||
" <td>2023-09-02 10:01:45</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1.48</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>163</td>\n",
|
||||
" <td>164</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>9.3</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>2.66</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>15.96</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>99996</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-02 09:25:34</td>\n",
|
||||
" <td>2023-09-02 09:55:20</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>17.49</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>132</td>\n",
|
||||
" <td>164</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>70.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>24.28</td>\n",
|
||||
" <td>6.94</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>106.97</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>1.75</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>99997</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-02 09:57:55</td>\n",
|
||||
" <td>2023-09-02 10:04:52</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.73</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>164</td>\n",
|
||||
" <td>249</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>10.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>2.80</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>16.80</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>99998</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-02 09:35:02</td>\n",
|
||||
" <td>2023-09-02 09:43:28</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.32</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>113</td>\n",
|
||||
" <td>170</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>10.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>4.20</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>18.20</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>0.00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>99999</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2023-09-02 09:46:09</td>\n",
|
||||
" <td>2023-09-02 10:03:58</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>8.79</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>138</td>\n",
|
||||
" <td>170</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>35.9</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>0.5</td>\n",
|
||||
" <td>10.37</td>\n",
|
||||
" <td>6.94</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>63.96</td>\n",
|
||||
" <td>2.5</td>\n",
|
||||
" <td>1.75</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>100000 rows × 19 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n",
|
||||
"0 1 2023-09-01 00:15:37 2023-09-01 00:20:21 1 \n",
|
||||
"1 2 2023-09-01 00:18:40 2023-09-01 00:30:28 2 \n",
|
||||
"2 2 2023-09-01 00:35:01 2023-09-01 00:39:04 1 \n",
|
||||
"3 2 2023-09-01 00:45:45 2023-09-01 00:47:37 1 \n",
|
||||
"4 2 2023-09-01 00:01:23 2023-09-01 00:38:05 1 \n",
|
||||
"... ... ... ... ... \n",
|
||||
"99995 2 2023-09-02 09:55:17 2023-09-02 10:01:45 2 \n",
|
||||
"99996 2 2023-09-02 09:25:34 2023-09-02 09:55:20 3 \n",
|
||||
"99997 2 2023-09-02 09:57:55 2023-09-02 10:04:52 1 \n",
|
||||
"99998 2 2023-09-02 09:35:02 2023-09-02 09:43:28 1 \n",
|
||||
"99999 2 2023-09-02 09:46:09 2023-09-02 10:03:58 1 \n",
|
||||
"\n",
|
||||
" trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n",
|
||||
"0 0.80 1 N 163 \n",
|
||||
"1 2.34 1 N 236 \n",
|
||||
"2 1.62 1 N 162 \n",
|
||||
"3 0.74 1 N 141 \n",
|
||||
"4 9.85 1 N 138 \n",
|
||||
"... ... ... ... ... \n",
|
||||
"99995 1.48 1 N 163 \n",
|
||||
"99996 17.49 2 N 132 \n",
|
||||
"99997 1.73 1 N 164 \n",
|
||||
"99998 1.32 1 N 113 \n",
|
||||
"99999 8.79 1 N 138 \n",
|
||||
"\n",
|
||||
" DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n",
|
||||
"0 230 2 6.5 3.5 0.5 0.00 \n",
|
||||
"1 233 1 14.2 1.0 0.5 2.00 \n",
|
||||
"2 236 1 8.6 1.0 0.5 2.00 \n",
|
||||
"3 229 1 5.1 1.0 0.5 1.00 \n",
|
||||
"4 230 1 45.0 6.0 0.5 17.02 \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"99995 164 1 9.3 0.0 0.5 2.66 \n",
|
||||
"99996 164 1 70.0 0.0 0.5 24.28 \n",
|
||||
"99997 249 1 10.0 0.0 0.5 2.80 \n",
|
||||
"99998 170 1 10.0 0.0 0.5 4.20 \n",
|
||||
"99999 170 1 35.9 5.0 0.5 10.37 \n",
|
||||
"\n",
|
||||
" tolls_amount improvement_surcharge total_amount \\\n",
|
||||
"0 0.00 1.0 11.50 \n",
|
||||
"1 0.00 1.0 21.20 \n",
|
||||
"2 0.00 1.0 15.60 \n",
|
||||
"3 0.00 1.0 11.10 \n",
|
||||
"4 0.00 1.0 73.77 \n",
|
||||
"... ... ... ... \n",
|
||||
"99995 0.00 1.0 15.96 \n",
|
||||
"99996 6.94 1.0 106.97 \n",
|
||||
"99997 0.00 1.0 16.80 \n",
|
||||
"99998 0.00 1.0 18.20 \n",
|
||||
"99999 6.94 1.0 63.96 \n",
|
||||
"\n",
|
||||
" congestion_surcharge Airport_fee \n",
|
||||
"0 2.5 0.00 \n",
|
||||
"1 2.5 0.00 \n",
|
||||
"2 2.5 0.00 \n",
|
||||
"3 2.5 0.00 \n",
|
||||
"4 2.5 1.75 \n",
|
||||
"... ... ... \n",
|
||||
"99995 2.5 0.00 \n",
|
||||
"99996 2.5 1.75 \n",
|
||||
"99997 2.5 0.00 \n",
|
||||
"99998 2.5 0.00 \n",
|
||||
"99999 2.5 1.75 \n",
|
||||
"\n",
|
||||
"[100000 rows x 19 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 66,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#This part is for testing\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Creating batches of 100,000 for the paraquet file\n",
|
||||
"batches_iter = file.iter_batches(batch_size=100000)\n",
|
||||
"batches_iter\n",
|
||||
"\n",
|
||||
"# Take the first batch for testing\n",
|
||||
"df = next(batches_iter).to_pandas()\n",
|
||||
"df\n",
|
||||
"\n",
|
||||
"# Creating just the table in postgres\n",
|
||||
"#df.head(0).to_sql(name='ny_taxi_data',con=engine, if_exists='replace')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7fdda025",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-04T00:08:07.651559Z",
|
||||
"start_time": "2023-12-04T00:02:35.940526Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"inserting batch 1...\n",
|
||||
"inserted! time taken 12.916 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 2...\n",
|
||||
"inserted! time taken 11.782 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 3...\n",
|
||||
"inserted! time taken 11.854 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 4...\n",
|
||||
"inserted! time taken 11.753 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 5...\n",
|
||||
"inserted! time taken 12.034 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 6...\n",
|
||||
"inserted! time taken 11.742 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 7...\n",
|
||||
"inserted! time taken 12.351 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 8...\n",
|
||||
"inserted! time taken 11.052 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 9...\n",
|
||||
"inserted! time taken 12.167 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 10...\n",
|
||||
"inserted! time taken 12.335 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 11...\n",
|
||||
"inserted! time taken 11.375 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 12...\n",
|
||||
"inserted! time taken 10.937 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 13...\n",
|
||||
"inserted! time taken 12.208 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 14...\n",
|
||||
"inserted! time taken 11.542 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 15...\n",
|
||||
"inserted! time taken 11.460 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 16...\n",
|
||||
"inserted! time taken 11.868 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 17...\n",
|
||||
"inserted! time taken 11.162 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 18...\n",
|
||||
"inserted! time taken 11.774 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 19...\n",
|
||||
"inserted! time taken 11.772 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 20...\n",
|
||||
"inserted! time taken 10.971 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 21...\n",
|
||||
"inserted! time taken 11.483 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 22...\n",
|
||||
"inserted! time taken 11.718 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 23...\n",
|
||||
"inserted! time taken 11.628 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 24...\n",
|
||||
"inserted! time taken 11.622 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 25...\n",
|
||||
"inserted! time taken 11.236 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 26...\n",
|
||||
"inserted! time taken 11.258 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 27...\n",
|
||||
"inserted! time taken 11.746 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 28...\n",
|
||||
"inserted! time taken 10.031 seconds.\n",
|
||||
"\n",
|
||||
"inserting batch 29...\n",
|
||||
"inserted! time taken 5.077 seconds.\n",
|
||||
"\n",
|
||||
"Completed! Total time taken was 331.674 seconds for 29 batches.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Insert values into the table \n",
|
||||
"t_start = time()\n",
|
||||
"count = 0\n",
|
||||
"for batch in file.iter_batches(batch_size=100000):\n",
|
||||
" count+=1\n",
|
||||
" batch_df = batch.to_pandas()\n",
|
||||
" print(f'inserting batch {count}...')\n",
|
||||
" b_start = time()\n",
|
||||
" \n",
|
||||
" batch_df.to_sql(name='ny_taxi_data',con=engine, if_exists='append')\n",
|
||||
" b_end = time()\n",
|
||||
" print(f'inserted! time taken {b_end-b_start:10.3f} seconds.\\n')\n",
|
||||
" \n",
|
||||
"t_end = time() \n",
|
||||
"print(f'Completed! Total time taken was {t_end-t_start:10.3f} seconds for {count} batches.') "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a7c102be",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Extra bit\n",
|
||||
"\n",
|
||||
"While trying to do the SQL Refresher, there was a need to add a lookup zones table but the file is in ```.csv``` format. \n",
|
||||
"\n",
|
||||
"Let's code to handle both ```.csv``` and ```.paraquet``` files!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a643d171",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-05T20:59:29.236458Z",
|
||||
"start_time": "2023-12-05T20:59:28.551221Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from time import time\n",
|
||||
"import pandas as pd \n",
|
||||
"import pyarrow.parquet as pq\n",
|
||||
"from sqlalchemy import create_engine"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "62c9040a",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-05T21:18:11.346552Z",
|
||||
"start_time": "2023-12-05T21:18:11.337475Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'yellow_tripdata_2023-09.parquet'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'\n",
|
||||
"url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-09.parquet'\n",
|
||||
"\n",
|
||||
"file_name = url.rsplit('/', 1)[-1].strip()\n",
|
||||
"file_name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e495fa96",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-12-05T21:18:33.001561Z",
|
||||
"start_time": "2023-12-05T21:18:32.844872Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"oh yea\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"if '.csv' in file_name:\n",
|
||||
" print('yay') \n",
|
||||
" df = pd.read_csv(file_name, nrows=10)\n",
|
||||
" df_iter = pd.read_csv(file_name, iterator=True, chunksize=100000)\n",
|
||||
"elif '.parquet' in file_name:\n",
|
||||
" print('oh yea')\n",
|
||||
" file = pq.ParquetFile(file_name)\n",
|
||||
" df = next(file.iter_batches(batch_size=10)).to_pandas()\n",
|
||||
" df_iter = file.iter_batches(batch_size=100000)\n",
|
||||
"else: \n",
|
||||
" print('Error. Only .csv or .parquet files allowed.')\n",
|
||||
" sys.exit() "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7556748f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This code is a rough code and seems to be working. The cleaned up version will be in `data-loading-parquet.py` file."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"hide_input": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.5"
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
86
01-docker-terraform/2_docker_sql/data-loading-parquet.py
Normal file
@@ -0,0 +1,86 @@
|
||||
#Cleaned up version of data-loading.ipynb
|
||||
import argparse, os, sys
|
||||
from time import time
|
||||
import pandas as pd
|
||||
import pyarrow.parquet as pq
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
|
||||
def main(params):
|
||||
user = params.user
|
||||
password = params.password
|
||||
host = params.host
|
||||
port = params.port
|
||||
db = params.db
|
||||
tb = params.tb
|
||||
url = params.url
|
||||
|
||||
# Get the name of the file from url
|
||||
file_name = url.rsplit('/', 1)[-1].strip()
|
||||
print(f'Downloading {file_name} ...')
|
||||
# Download file from url
|
||||
os.system(f'curl {url.strip()} -o {file_name}')
|
||||
print('\n')
|
||||
|
||||
# Create SQL engine
|
||||
engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}')
|
||||
|
||||
# Read file based on csv or parquet
|
||||
if '.csv' in file_name:
|
||||
df = pd.read_csv(file_name, nrows=10)
|
||||
df_iter = pd.read_csv(file_name, iterator=True, chunksize=100000)
|
||||
elif '.parquet' in file_name:
|
||||
file = pq.ParquetFile(file_name)
|
||||
df = next(file.iter_batches(batch_size=10)).to_pandas()
|
||||
df_iter = file.iter_batches(batch_size=100000)
|
||||
else:
|
||||
print('Error. Only .csv or .parquet files allowed.')
|
||||
sys.exit()
|
||||
|
||||
|
||||
# Create the table
|
||||
df.head(0).to_sql(name=tb, con=engine, if_exists='replace')
|
||||
|
||||
|
||||
# Insert values
|
||||
t_start = time()
|
||||
count = 0
|
||||
for batch in df_iter:
|
||||
count+=1
|
||||
|
||||
if '.parquet' in file_name:
|
||||
batch_df = batch.to_pandas()
|
||||
else:
|
||||
batch_df = batch
|
||||
|
||||
print(f'inserting batch {count}...')
|
||||
|
||||
b_start = time()
|
||||
batch_df.to_sql(name=tb, con=engine, if_exists='append')
|
||||
b_end = time()
|
||||
|
||||
print(f'inserted! time taken {b_end-b_start:10.3f} seconds.\n')
|
||||
|
||||
t_end = time()
|
||||
print(f'Completed! Total time taken was {t_end-t_start:10.3f} seconds for {count} batches.')
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#Parsing arguments
|
||||
parser = argparse.ArgumentParser(description='Load data from a .csv or .parquet file URL into a Postgres database.')
|
||||
|
||||
parser.add_argument('--user', help='Username for Postgres.')
|
||||
parser.add_argument('--password', help='Password to the username for Postgres.')
|
||||
parser.add_argument('--host', help='Hostname for Postgres.')
|
||||
parser.add_argument('--port', help='Port for Postgres connection.')
|
||||
parser.add_argument('--db', help='Database name for Postgres.')
|
||||
parser.add_argument('--tb', help='Destination table name for Postgres.')
|
||||
parser.add_argument('--url', help='URL of the .csv or .parquet file.')
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
|
||||
|
||||
215
01-docker-terraform/README.md
Normal file
@@ -0,0 +1,215 @@
|
||||
# Introduction
|
||||
|
||||
* [](https://www.youtube.com/watch?v=AtRhA-NfS24&list=PL3MmuxUbc_hKihpnNQ9qtTmWYy26bPrSb&index=3)
|
||||
* [Slides](https://www.slideshare.net/AlexeyGrigorev/data-engineering-zoomcamp-introduction)
|
||||
* Overview of [Architecture](https://github.com/DataTalksClub/data-engineering-zoomcamp#overview), [Technologies](https://github.com/DataTalksClub/data-engineering-zoomcamp#technologies) & [Pre-Requisites](https://github.com/DataTalksClub/data-engineering-zoomcamp#prerequisites)
|
||||
|
||||
|
||||
We suggest watching videos in the same order as in this document.
|
||||
|
||||
The last video (setting up the environment) is optional, but you can check it earlier
|
||||
if you have trouble setting up the environment and following along with the videos.
|
||||
|
||||
|
||||
# Docker + Postgres
|
||||
|
||||
[Code](2_docker_sql)
|
||||
|
||||
## :movie_camera: Introduction to Docker
|
||||
|
||||
[](https://youtu.be/EYNwNlOrpr0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=4)
|
||||
|
||||
* Why do we need Docker
|
||||
* Creating a simple "data pipeline" in Docker
|
||||
|
||||
|
||||
## :movie_camera: Ingesting NY Taxi Data to Postgres
|
||||
|
||||
[](https://youtu.be/2JM-ziJt0WI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=5)
|
||||
|
||||
* Running Postgres locally with Docker
|
||||
* Using `pgcli` for connecting to the database
|
||||
* Exploring the NY Taxi dataset
|
||||
* Ingesting the data into the database
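
To make the ingestion step above concrete, here is a minimal sketch of loading a CSV into Postgres in chunks with pandas and SQLAlchemy. It assumes the Postgres container from this module is running with user `root`, password `root` and database `ny_taxi` on `localhost:5432` (the same connection string used in the notebook in this compare view); the file and table names below are placeholders, so adjust them to your setup:

```python
import pandas as pd
from sqlalchemy import create_engine

# Connection string matching the Postgres container used in this module
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

# Read the CSV in 100,000-row chunks so it never has to fit in memory at once
# (file name is a placeholder for whichever month of taxi data you downloaded)
chunks = pd.read_csv("yellow_tripdata_2021-01.csv", chunksize=100_000)

for i, chunk in enumerate(chunks, start=1):
    # The first chunk (re)creates the table, later chunks append to it
    chunk.to_sql("yellow_taxi_data", con=engine,
                 if_exists="replace" if i == 1 else "append", index=False)
    print(f"inserted chunk {i}")
```

The video also converts the pickup/dropoff columns to timestamps before inserting; that step is omitted here to keep the sketch short.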
|
||||
|
||||
> [!TIP]
|
||||
>If you have problems with `pgcli`, check this video for an alternative way to connect to your database from a Jupyter notebook with pandas.
|
||||
>
|
||||
> [](https://youtu.be/3IkfkTwqHx4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=6)
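
If `pgcli` is the only thing blocking you, a quick way to sanity-check the database from a Jupyter notebook is pandas plus SQLAlchemy, as in this sketch (same connection details as the rest of the module; adjust if yours differ):

```python
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

# List the tables in the public schema to confirm the connection works
query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public';
"""
print(pd.read_sql(query, con=engine))
```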
|
||||
|
||||
|
||||
## :movie_camera: Connecting pgAdmin and Postgres
|
||||
|
||||
[](https://youtu.be/hCAIVe9N0ow&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=7)
|
||||
|
||||
* The pgAdmin tool
|
||||
* Docker networks
|
||||
|
||||
|
||||
> [!IMPORTANT]
|
||||
>The UI for pgAdmin 4 has changed; please follow the steps below to create a server:
|
||||
>
|
||||
>* After logging in to pgAdmin, right-click Servers in the left sidebar.
|
||||
>* Click on Register.
|
||||
>* Click on Server.
|
||||
>* The remaining steps to create a server are the same as in the videos.
|
||||
|
||||
|
||||
## :movie_camera: Putting the ingestion script into Docker
|
||||
|
||||
[](https://youtu.be/B1WwATwf-vY&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=8)
|
||||
|
||||
* Converting the Jupyter notebook to a Python script
|
||||
* Parametrizing the script with argparse
|
||||
* Dockerizing the ingestion script
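
For reference, "parametrizing the script with argparse" boils down to something like the sketch below. The flag names mirror the `data-loading-parquet.py` file shown earlier in this compare view; the script name in the usage line afterwards is only an example:

```python
import argparse

parser = argparse.ArgumentParser(description="Ingest NY taxi data into Postgres.")
parser.add_argument("--user", help="Postgres username")
parser.add_argument("--password", help="Postgres password")
parser.add_argument("--host", help="Postgres host")
parser.add_argument("--port", help="Postgres port")
parser.add_argument("--db", help="database name")
parser.add_argument("--tb", help="destination table name")
parser.add_argument("--url", help="URL of the .csv or .parquet file to ingest")

args = parser.parse_args()
print(f"would ingest {args.url} into {args.db}.{args.tb} on {args.host}:{args.port}")
```

You would then run it as, for example, `python ingest_data.py --user=root --password=root --host=localhost --port=5432 --db=ny_taxi --tb=yellow_taxi_data --url=<file url>` (use whatever name you gave your script).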
|
||||
|
||||
## :movie_camera: Running Postgres and pgAdmin with Docker-Compose
|
||||
|
||||
[](https://youtu.be/hKI6PkPhpa0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=9)
|
||||
|
||||
* Why do we need Docker-compose
|
||||
* Docker-compose YAML file
|
||||
* Running multiple containers with `docker-compose up`
|
||||
|
||||
## :movie_camera: SQL refresher
|
||||
|
||||
[](https://youtu.be/QEcps_iskgg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=10)
|
||||
|
||||
* Adding the Zones table
|
||||
* Inner joins
|
||||
* Basic data quality checks
|
||||
* Left, Right and Outer joins
|
||||
* Group by
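
As a taste of the refresher topics above, here is a sketch of an inner join plus group-by against the tables created earlier in the module, runnable from a notebook. The table and column names (`ny_taxi_data` from the notebook in this compare view, plus a `zones` lookup table loaded from the taxi zone CSV) are assumptions, so rename them to match what you actually created:

```python
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

# Join trips to the zones lookup on pickup location and count trips per zone
query = """
SELECT z."Zone"  AS pickup_zone,
       COUNT(*)  AS trips
FROM ny_taxi_data t
JOIN zones z
  ON t."PULocationID" = z."LocationID"
GROUP BY 1
ORDER BY trips DESC
LIMIT 10;
"""
print(pd.read_sql(query, con=engine))
```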
|
||||
|
||||
## :movie_camera: Optional: Docker Networking and Port Mapping
|
||||
|
||||
> [!TIP]
|
||||
> Optional: If you have problems with Docker networking, check the **Port Mapping and Networks in Docker** video.
|
||||
|
||||
[](https://youtu.be/tOr4hTsHOzU&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=5)
|
||||
|
||||
* Docker networks
|
||||
* Port forwarding to the host environment
|
||||
* Communicating between containers in the network
|
||||
* `.dockerignore` file
|
||||
|
||||
## :movie_camera: Optional: Walk-Through on WSL
|
||||
|
||||
> [!TIP]
|
||||
> Optional: If you want to do the steps from "Ingesting NY Taxi Data to Postgres" through "Running Postgres and pgAdmin with Docker-Compose" on Windows Subsystem for Linux (WSL), please check **Docker Module Walk-Through on WSL**.
|
||||
|
||||
[](https://youtu.be/Mv4zFm2AwzQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=33)
|
||||
|
||||
|
||||
# GCP
|
||||
|
||||
## :movie_camera: Introduction to GCP (Google Cloud Platform)
|
||||
|
||||
[](https://youtu.be/18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=3)
|
||||
|
||||
# Terraform
|
||||
|
||||
[Code](1_terraform_gcp)
|
||||
|
||||
## :movie_camera: Introduction to Terraform: Concepts and Overview, a primer
|
||||
|
||||
[](https://youtu.be/s2bOYDCKl_M&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=11)
|
||||
|
||||
* [Companion Notes](1_terraform_gcp)
|
||||
|
||||
## :movie_camera: Terraform Basics: Simple one file Terraform Deployment
|
||||
|
||||
[](https://youtu.be/Y2ux7gq3Z0o&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=12)
|
||||
|
||||
* [Companion Notes](1_terraform_gcp)
|
||||
|
||||
## :movie_camera: Deployment with a Variables File
|
||||
|
||||
[](https://youtu.be/PBi0hHjLftk&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=13)
|
||||
|
||||
* [Companion Notes](1_terraform_gcp)
|
||||
|
||||
## Configuring terraform and GCP SDK on Windows
|
||||
|
||||
* [Instructions](1_terraform_gcp/windows.md)
|
||||
|
||||
|
||||
# Environment setup
|
||||
|
||||
For the course you'll need:
|
||||
|
||||
* Python 3 (e.g. installed with Anaconda)
|
||||
* Google Cloud SDK
|
||||
* Docker with docker-compose
|
||||
* Terraform
|
||||
* GitHub account
|
||||
|
||||
> [!NOTE]
|
||||
>If you have problems setting up the environment, you can check these videos.
|
||||
>
|
||||
>If you already have a working coding environment on your local machine, these videos are optional, and you only need to pick one method. But if you have time to learn one now, it will be helpful if your local environment suddenly stops working one day.
|
||||
|
||||
## :movie_camera: GCP Cloud VM
|
||||
|
||||
### Setting up the environment on cloud VM
|
||||
[](https://youtu.be/ae-CV2KfoN0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=14)
|
||||
|
||||
* Generating SSH keys
|
||||
* Creating a virtual machine on GCP
|
||||
* Connecting to the VM with SSH
|
||||
* Installing Anaconda
|
||||
* Installing Docker
|
||||
* Creating SSH `config` file
|
||||
* Accessing the remote machine with VS Code and SSH remote
|
||||
* Installing docker-compose
|
||||
* Installing pgcli
|
||||
* Port-forwarding with VS code: connecting to pgAdmin and Jupyter from the local computer
|
||||
* Installing Terraform
|
||||
* Using `sftp` for putting the credentials to the remote machine
|
||||
* Shutting down and removing the instance
|
||||
|
||||
## :movie_camera: GitHub Codespaces
|
||||
|
||||
### Preparing the environment with GitHub Codespaces
|
||||
|
||||
[](https://youtu.be/XOSUt8Ih3zA&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=15)
|
||||
|
||||
# Homework
|
||||
|
||||
* [Homework](../cohorts/2024/01-docker-terraform/homework.md)
|
||||
|
||||
|
||||
# Community notes
|
||||
|
||||
Did you take notes? You can share them here
|
||||
|
||||
* [Notes from Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/1_intro.md)
|
||||
* [Notes from Abd](https://itnadigital.notion.site/Week-1-Introduction-f18de7e69eb4453594175d0b1334b2f4)
|
||||
* [Notes from Aaron](https://github.com/ABZ-Aaron/DataEngineerZoomCamp/blob/master/week_1_basics_n_setup/README.md)
|
||||
* [Notes from Faisal](https://github.com/FaisalMohd/data-engineering-zoomcamp/blob/main/week_1_basics_n_setup/Notes/DE%20Zoomcamp%20Week-1.pdf)
|
||||
* [Michael Harty's Notes](https://github.com/mharty3/data_engineering_zoomcamp_2022/tree/main/week01)
|
||||
* [Blog post from Isaac Kargar](https://kargarisaac.github.io/blog/data%20engineering/jupyter/2022/01/18/data-engineering-w1.html)
|
||||
* [Handwritten Notes By Mahmoud Zaher](https://github.com/zaherweb/DataEngineering/blob/master/week%201.pdf)
|
||||
* [Notes from Candace Williams](https://teacherc.github.io/data-engineering/2023/01/18/zoomcamp1.html)
|
||||
* [Notes from Marcos Torregrosa](https://www.n4gash.com/2023/data-engineering-zoomcamp-semana-1/)
|
||||
* [Notes from Vincenzo Galante](https://binchentso.notion.site/Data-Talks-Club-Data-Engineering-Zoomcamp-8699af8e7ff94ec49e6f9bdec8eb69fd)
|
||||
* [Notes from Victor Padilha](https://github.com/padilha/de-zoomcamp/tree/master/week1)
|
||||
* [Notes from froukje](https://github.com/froukje/de-zoomcamp/blob/main/week_1_basics_n_setup/notes/notes_week_01.md)
|
||||
* [Notes from adamiaonr](https://github.com/adamiaonr/data-engineering-zoomcamp/blob/main/week_1_basics_n_setup/2_docker_sql/NOTES.md)
|
||||
* [Notes from Xia He-Bleinagel](https://xiahe-bleinagel.com/2023/01/week-1-data-engineering-zoomcamp-notes/)
|
||||
* [Notes from Balaji](https://github.com/Balajirvp/DE-Zoomcamp/blob/main/Week%201/Detailed%20Week%201%20Notes.ipynb)
|
||||
* [Notes from Erik](https://twitter.com/ehub96/status/1621351266281730049)
|
||||
* [Notes by Alain Boisvert](https://github.com/boisalai/de-zoomcamp-2023/blob/main/week1.md)
|
||||
* Notes on [Docker, Docker Compose, and setting up a proper Python environment](https://medium.com/@verazabeida/zoomcamp-2023-week-1-f4f94cb360ae), by Vera
|
||||
* [Setting up the development environment on Google Virtual Machine](https://itsadityagupta.hashnode.dev/setting-up-the-development-environment-on-google-virtual-machine), blog post by Aditya Gupta
|
||||
* [Notes from Zharko Cekovski](https://www.zharconsulting.com/contents/data/data-engineering-bootcamp-2024/week-1-postgres-docker-and-ingestion-scripts/)
|
||||
* [2024 Module-01 Walkthrough video by ellacharmed on YouTube](https://youtu.be/VUZshlVAnk4)
|
||||
* [2024 Companion Module Walkthrough slides by ellacharmed](https://github.com/ellacharmed/data-engineering-zoomcamp/blob/ella2024/cohorts/2024/01-docker-terraform/walkthrough-01.pdf)
|
||||
* [2024 Module-01 Environment setup video by ellacharmed on YouTube](https://youtu.be/Zce_Hd37NGs)
|
||||
* [Docker Notes by Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/1a-docker_sql/readme.md) • [Terraform Notes by Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/1b-terraform_gcp/readme.md)
|
||||
* [Notes from Hammad Tariq](https://github.com/hamad-tariq/HammadTariq-ZoomCamp2024/blob/9c8b4908416eb8cade3d7ec220e7664c003e9b11/week_1_basics_n_setup/README.md)
|
||||
* [Hung's Notes](https://hung.bearblog.dev/docker/) & [Docker Cheatsheet](https://github.com/HangenYuu/docker-cheatsheet)
|
||||
* [Kemal's Notes](https://github.com/kemaldahha/data-engineering-course/blob/main/week_1_notes.md)
|
||||
* [Notes from Manuel Guerra (Windows+WSL2 Environment)](https://github.com/ManuelGuerra1987/data-engineering-zoomcamp-notes/blob/main/1_Containerization-and-Infrastructure-as-Code/README.md)
|
||||
* [Notes from Horeb SEIDOU](https://www.notion.so/Week-1-Containerization-and-Infrastructure-as-Code-15729780dc4a80a08288e497ba937a37?pvs=4)
|
||||
* Add your notes above this line
|
||||
306 02-workflow-orchestration/README.md Normal file
@@ -0,0 +1,306 @@
|
||||
# Week 2: Workflow Orchestration
|
||||
|
||||
Welcome to Week 2 of the Data Engineering Zoomcamp! This week, we’ll dive into workflow orchestration using [Kestra](https://go.kestra.io/de-zoomcamp/github).
|
||||
|
||||
Kestra is an open-source, event-driven orchestration platform that simplifies building both scheduled and event-driven workflows. By adopting Infrastructure as Code practices for data and process orchestration, Kestra enables you to build reliable workflows with just a few lines of YAML.
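For a sense of what "a few lines of YAML" looks like, here is a minimal, hypothetical flow (not part of the course materials): a single task that logs a message. The flows used this week follow the same `id`/`namespace`/`tasks` structure.

```yaml
# Minimal illustrative flow: one task that prints a log line.
id: hello_world
namespace: zoomcamp
tasks:
  - id: hello
    type: io.kestra.plugin.core.log.Log
    message: Hello from Kestra!
```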
|
||||
|
||||
> [!NOTE]
|
||||
>You can find all videos for this week in this [YouTube Playlist](https://go.kestra.io/de-zoomcamp/yt-playlist).
|
||||
|
||||
---
|
||||
|
||||
# Course Structure
|
||||
|
||||
## 1. Conceptual Material: Introduction to Orchestration and Kestra
|
||||
|
||||
In this section, you’ll learn the foundations of workflow orchestration, its importance, and how Kestra fits into the orchestration landscape.
|
||||
|
||||
### Videos
|
||||
- **2.2.1 - Introduction to Workflow Orchestration**
|
||||
[](https://youtu.be/Np6QmmcgLCs)
|
||||
|
||||
- **2.2.2 - Learn the Concepts of Kestra**
|
||||
[](https://youtu.be/o79n-EVpics)
|
||||
|
||||
### Resources
|
||||
- [Quickstart Guide](https://go.kestra.io/de-zoomcamp/quickstart)
|
||||
- [Install Kestra with Docker Compose](https://go.kestra.io/de-zoomcamp/docker-compose)
|
||||
- [Tutorial](https://go.kestra.io/de-zoomcamp/tutorial)
|
||||
- [What is an Orchestrator?](https://go.kestra.io/de-zoomcamp/what-is-an-orchestrator)
|
||||
|
||||
---
|
||||
|
||||
## 2. Hands-On Coding Project: Build Data Pipelines with Kestra
|
||||
|
||||
This week, we're going to build ETL pipelines for the Yellow and Green Taxi data from NYC’s Taxi and Limousine Commission (TLC). You will:
|
||||
1. Extract data from [CSV files](https://github.com/DataTalksClub/nyc-tlc-data/releases).
|
||||
2. Load it into Postgres or Google Cloud (GCS + BigQuery).
|
||||
3. Explore scheduling and backfilling workflows.
|
||||
|
||||
### File Structure
|
||||
|
||||
The project is organized as follows:
|
||||
```
|
||||
.
|
||||
├── flows/
|
||||
│ ├── 01_getting_started_data_pipeline.yaml
|
||||
│ ├── 02_postgres_taxi.yaml
|
||||
│ ├── 02_postgres_taxi_scheduled.yaml
|
||||
│ ├── 03_postgres_dbt.yaml
|
||||
│ ├── 04_gcp_kv.yaml
|
||||
│ ├── 05_gcp_setup.yaml
|
||||
│ ├── 06_gcp_taxi.yaml
|
||||
│ ├── 06_gcp_taxi_scheduled.yaml
|
||||
│ └── 07_gcp_dbt.yaml
|
||||
```
|
||||
|
||||
### Setup Kestra
|
||||
|
||||
We'll set up Kestra with Docker Compose, which starts two containers: one for the Kestra server and another for its Postgres backend database:
|
||||
|
||||
```bash
|
||||
cd 02-workflow-orchestration/
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
Once the containers start, you can access the Kestra UI at [http://localhost:8080](http://localhost:8080).
|
||||
|
||||
If you prefer to add flows programmatically using Kestra's API, run the following commands:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/01_getting_started_data_pipeline.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/02_postgres_taxi.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/02_postgres_taxi_scheduled.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/03_postgres_dbt.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/04_gcp_kv.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/05_gcp_setup.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/06_gcp_taxi.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/06_gcp_taxi_scheduled.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/07_gcp_dbt.yaml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. ETL Pipelines in Kestra: Detailed Walkthrough
|
||||
|
||||
### Getting Started Pipeline
|
||||
|
||||
This introductory flow demonstrates a simple data pipeline: it extracts data via an HTTP REST API, transforms that data in Python, and then queries it using DuckDB.
|
||||
|
||||
### Videos
|
||||
|
||||
- **2.2.3 - Create an ETL Pipeline with Postgres in Kestra**
|
||||
[](https://youtu.be/OkfLX28Ecjg?si=vKbIyWo1TtjpNnvt)
|
||||
- **2.2.4 - Manage Scheduling and Backfills using Postgres in Kestra**
|
||||
[](https://youtu.be/_-li_z97zog?si=G6jZbkfJb3GAyqrd)
|
||||
- **2.2.5 - Transform Data with dbt and Postgres in Kestra**
|
||||
[](https://youtu.be/ZLp2N6p2JjE?si=tWhcvq5w4lO8v1_p)
|
||||
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Extract[Extract Data via HTTP REST API] --> Transform[Transform Data in Python]
|
||||
Transform --> Query[Query Data with DuckDB]
|
||||
```
|
||||
|
||||
Add the flow [`01_getting_started_data_pipeline.yaml`](flows/01_getting_started_data_pipeline.yaml) from the UI if you haven't already and execute it to see the results. Inspect the Gantt and Logs tabs to understand the flow execution.
|
||||
|
||||
### Local DB: Load Taxi Data to Postgres
|
||||
|
||||
Before we start loading data to GCP, we'll first play with the Yellow and Green Taxi data using a local Postgres database running in a Docker container. We'll create a new Postgres database for these examples using this [Docker Compose file](postgres/docker-compose.yml). Download it into a new directory, navigate to it and run the following command to start it:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
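If you just want to see what that Compose file roughly contains, here is a minimal sketch. The database name, user and password mirror the `pluginDefaults` used in the flows below; the port mapping and the pgAdmin service (including its credentials and port) are illustrative assumptions.

```yaml
# Minimal sketch of a local Postgres + pgAdmin setup for the taxi examples (assumed values).
services:
  postgres:
    image: postgres
    environment:
      POSTGRES_DB: postgres-zoomcamp   # matches the JDBC URL used in the flows below
      POSTGRES_USER: kestra
      POSTGRES_PASSWORD: k3str4
    ports:
      - "5432:5432"                    # exposed so Kestra can reach it via host.docker.internal
  pgadmin:
    image: dpage/pgadmin4
    environment:
      PGADMIN_DEFAULT_EMAIL: admin@admin.com   # assumed login for the local pgAdmin UI
      PGADMIN_DEFAULT_PASSWORD: admin
    ports:
      - "8085:80"                      # assumed port, to avoid clashing with Kestra on 8080
```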
|
||||
|
||||
The flow will extract the CSV data partitioned by year and month, create the required tables, load that month's data into a staging table, and finally merge it into the final destination table.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Start[Select Year & Month] --> SetLabel[Set Labels]
|
||||
SetLabel --> Extract[Extract CSV Data]
|
||||
Extract -->|Taxi=Yellow| YellowFinalTable[Create Yellow Final Table]:::yellow
|
||||
Extract -->|Taxi=Green| GreenFinalTable[Create Green Final Table]:::green
|
||||
YellowFinalTable --> YellowMonthlyTable[Create Yellow Monthly Table]:::yellow
|
||||
GreenFinalTable --> GreenMonthlyTable[Create Green Monthly Table]:::green
|
||||
YellowMonthlyTable --> YellowCopyIn[Load Data to Monthly Table]:::yellow
|
||||
GreenMonthlyTable --> GreenCopyIn[Load Data to Monthly Table]:::green
|
||||
YellowCopyIn --> YellowMerge[Merge Yellow Data]:::yellow
|
||||
GreenCopyIn --> GreenMerge[Merge Green Data]:::green
|
||||
|
||||
classDef yellow fill:#FFD700,stroke:#000,stroke-width:1px;
|
||||
classDef green fill:#32CD32,stroke:#000,stroke-width:1px;
|
||||
```
|
||||
|
||||
The flow code: [`02_postgres_taxi.yaml`](flows/02_postgres_taxi.yaml).
|
||||
|
||||
|
||||
> [!NOTE]
|
||||
> The NYC Taxi and Limousine Commission (TLC) Trip Record Data provided on the [nyc.gov](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) website is currently available only in Parquet format, but this is NOT the dataset we're going to use in this course. For the purpose of this course, we'll use the **CSV files** available [here on GitHub](https://github.com/DataTalksClub/nyc-tlc-data/releases). This is because the Parquet format can be challenging for newcomers to understand, and we want to make the course as accessible as possible: the CSV format can be easily inspected with tools like Excel or Google Sheets, or even a simple text editor.
|
||||
|
||||
### Local DB: Learn Scheduling and Backfills
|
||||
|
||||
We can now schedule the same pipeline shown above to run on the first day of each month: at 9 AM UTC for the green dataset and at 10 AM UTC for the yellow dataset (see the Schedule triggers at the end of the flow). We'll also demonstrate how to backfill the data pipeline to run on historical data.
|
||||
|
||||
Note: given the size of the dataset, we'll backfill only the green taxi data for the year 2019.
|
||||
|
||||
The flow code: [`02_postgres_taxi_scheduled.yaml`](flows/02_postgres_taxi_scheduled.yaml).
|
||||
|
||||
### Local DB: Orchestrate dbt Models
|
||||
|
||||
Now that we have raw data ingested into a local Postgres database, we can use dbt to transform the data into meaningful insights. The flow will sync the dbt models from Git to Kestra and run the `dbt build` command to build the models.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Start[Select dbt command] --> Sync[Sync Namespace Files]
|
||||
Sync --> DbtBuild[Run dbt CLI]
|
||||
```
|
||||
|
||||
The flow code: [`03_postgres_dbt.yaml`](flows/03_postgres_dbt.yaml).
|
||||
|
||||
### Resources
|
||||
- [pgAdmin Download](https://www.pgadmin.org/download/)
|
||||
- [Postgres DB Docker Compose](postgres/docker-compose.yml)
|
||||
|
||||
---
|
||||
|
||||
## 4. ETL Pipelines in Kestra: Google Cloud Platform
|
||||
|
||||
Now that you've learned how to build ETL pipelines locally using Postgres, we are ready to move to the cloud. In this section, we'll load the same Yellow and Green Taxi data to Google Cloud Platform (GCP) using:
|
||||
1. Google Cloud Storage (GCS) as a data lake
|
||||
2. BigQuery as a data warehouse.
|
||||
|
||||
### Videos
|
||||
|
||||
- **2.2.6 - Create an ETL Pipeline with GCS and BigQuery in Kestra**
|
||||
[](https://youtu.be/nKqjjLJ7YXs)
|
||||
- **2.2.7 - Manage Scheduling and Backfills using BigQuery in Kestra**
|
||||
[](https://youtu.be/DoaZ5JWEkH0)
|
||||
- **2.2.8 - Transform Data with dbt and BigQuery in Kestra**
|
||||
[](https://youtu.be/eF_EdV4A1Wk)
|
||||
|
||||
### Setup Google Cloud Platform (GCP)
|
||||
|
||||
Before we start loading data to GCP, we need to set up the Google Cloud Platform.
|
||||
|
||||
First, adjust the following flow [`04_gcp_kv.yaml`](flows/04_gcp_kv.yaml) to include your service account, GCP project ID, BigQuery dataset and GCS bucket name (_along with their location_) as KV Store values:
|
||||
- GCP_CREDS
|
||||
- GCP_PROJECT_ID
|
||||
- GCP_LOCATION
|
||||
- GCP_BUCKET_NAME
|
||||
- GCP_DATASET
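Once stored, these key-value pairs can be read from any flow in the namespace with the `kv()` function. For example, the GCP flows below pass them to the GCP plugin via `pluginDefaults` (excerpt from [`05_gcp_setup.yaml`](flows/05_gcp_setup.yaml)):

```yaml
pluginDefaults:
  - type: io.kestra.plugin.gcp
    values:
      serviceAccount: "{{kv('GCP_CREDS')}}"
      projectId: "{{kv('GCP_PROJECT_ID')}}"
      location: "{{kv('GCP_LOCATION')}}"
      bucket: "{{kv('GCP_BUCKET_NAME')}}"
```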
|
||||
|
||||
|
||||
> [!WARNING]
|
||||
> The `GCP_CREDS` service account key contains sensitive information. Keep it as secure as your passwords and make sure you never commit it to Git.
|
||||
|
||||
### Create GCP Resources
|
||||
|
||||
If you haven't already created the GCS bucket and BigQuery dataset in the first week of the course, you can use this flow to create them: [`05_gcp_setup.yaml`](flows/05_gcp_setup.yaml).
|
||||
|
||||
|
||||
### GCP Workflow: Load Taxi Data to BigQuery
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
SetLabel[Set Labels] --> Extract[Extract CSV Data]
|
||||
Extract --> UploadToGCS[Upload Data to GCS]
|
||||
UploadToGCS -->|Taxi=Yellow| BQYellowTripdata[Main Yellow Tripdata Table]:::yellow
|
||||
UploadToGCS -->|Taxi=Green| BQGreenTripdata[Main Green Tripdata Table]:::green
|
||||
BQYellowTripdata --> BQYellowTableExt[External Table]:::yellow
|
||||
BQGreenTripdata --> BQGreenTableExt[External Table]:::green
|
||||
BQYellowTableExt --> BQYellowTableTmp[Monthly Table]:::yellow
|
||||
BQGreenTableExt --> BQGreenTableTmp[Monthly Table]:::green
|
||||
BQYellowTableTmp --> BQYellowMerge[Merge to Main Table]:::yellow
|
||||
BQGreenTableTmp --> BQGreenMerge[Merge to Main Table]:::green
|
||||
BQYellowMerge --> PurgeFiles[Purge Files]
|
||||
BQGreenMerge --> PurgeFiles[Purge Files]
|
||||
|
||||
classDef yellow fill:#FFD700,stroke:#000,stroke-width:1px;
|
||||
classDef green fill:#32CD32,stroke:#000,stroke-width:1px;
|
||||
```
|
||||
|
||||
The flow code: [`06_gcp_taxi.yaml`](flows/06_gcp_taxi.yaml).
|
||||
|
||||
### GCP Workflow: Schedule and Backfill Full Dataset
|
||||
|
||||
We can now schedule the same pipeline shown above to run on the first day of each month: at 9 AM UTC for the green dataset and at 10 AM UTC for the yellow dataset. You can backfill historical data directly from the Kestra UI.
|
||||
|
||||
Since we now process data in a cloud environment with infinitely scalable storage and compute, we can backfill the entire dataset for both the yellow and green taxi data without the risk of running out of resources on our local machine.
|
||||
|
||||
The flow code: [`06_gcp_taxi_scheduled.yaml`](flows/06_gcp_taxi_scheduled.yaml).
|
||||
|
||||
### GCP Workflow: Orchestrate dbt Models
|
||||
|
||||
Now that we have raw data ingested into BigQuery, we can use dbt to transform that data. The flow will sync the dbt models from Git to Kestra and run the `dbt build` command to build the models:
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Start[Select dbt command] --> Sync[Sync Namespace Files]
|
||||
Sync --> Build[Run dbt Build Command]
|
||||
```
|
||||
|
||||
The flow code: [`07_gcp_dbt.yaml`](flows/07_gcp_dbt.yaml).
|
||||
|
||||
---
|
||||
|
||||
## 5. Bonus: Deploy to the Cloud
|
||||
|
||||
Now that we've got our ETL pipeline working both locally and in the cloud, we can deploy Kestra itself to the cloud so that it keeps orchestrating our ETL pipelines every month on the configured schedules. We'll cover how to install Kestra on Google Cloud in production, and how to automatically sync and deploy your workflows from a Git repository.
|
||||
|
||||
### Videos
|
||||
|
||||
- **2.2.9 - Deploy Workflows to the Cloud with Git**
|
||||
[](https://youtu.be/l-wC71tI3co)
|
||||
|
||||
### Resources
|
||||
|
||||
- [Install Kestra on Google Cloud](https://go.kestra.io/de-zoomcamp/gcp-install)
|
||||
- [Moving from Development to Production](https://go.kestra.io/de-zoomcamp/dev-to-prod)
|
||||
- [Using Git in Kestra](https://go.kestra.io/de-zoomcamp/git)
|
||||
- [Deploy Flows with GitHub Actions](https://go.kestra.io/de-zoomcamp/deploy-github-actions)
|
||||
|
||||
## 6. Additional Resources 📚
|
||||
|
||||
- Check [Kestra Docs](https://go.kestra.io/de-zoomcamp/docs)
|
||||
- Explore our [Blueprints](https://go.kestra.io/de-zoomcamp/blueprints) library
|
||||
- Browse over 600 [plugins](https://go.kestra.io/de-zoomcamp/plugins) available in Kestra
|
||||
- Give us a star on [GitHub](https://go.kestra.io/de-zoomcamp/github)
|
||||
- Join our [Slack community](https://go.kestra.io/de-zoomcamp/slack) if you have any questions
|
||||
- Find all the videos in this [YouTube Playlist](https://go.kestra.io/de-zoomcamp/yt-playlist)
|
||||
|
||||
|
||||
### Troubleshooting tips
|
||||
|
||||
If you encounter an error similar to:
|
||||
|
||||
```
|
||||
BigQueryError{reason=invalid, location=null,
|
||||
message=Error while reading table: kestra-sandbox.zooomcamp.yellow_tripdata_2020_01,
|
||||
error message: CSV table references column position 17, but line contains only 14 columns.;
|
||||
line_number: 2103925 byte_offset_to_start_of_line: 194863028
|
||||
column_index: 17 column_name: "congestion_surcharge" column_type: NUMERIC
|
||||
File: gs://anna-geller/yellow_tripdata_2020-01.csv}
|
||||
```
|
||||
|
||||
It means that the CSV file you're trying to load into BigQuery has a mismatch in the number of columns between the external source table (i.e. the file in GCS) and the destination table in BigQuery. This can happen when, due to network or transfer issues, the file is not fully downloaded from GitHub or not correctly uploaded to GCS. The error suggests a schema issue, but that's not the case here. Simply rerun the entire execution, including redownloading the CSV file and reuploading it to GCS; this should resolve the issue.
|
||||
|
||||
|
||||
---
|
||||
|
||||
# Community notes
|
||||
|
||||
Did you take notes? You can share them by creating a PR to this file!
|
||||
|
||||
* [Notes from Manuel Guerra](https://github.com/ManuelGuerra1987/data-engineering-zoomcamp-notes/blob/main/2_Workflow-Orchestration-(Kestra)/README.md)
|
||||
* [Notes from Horeb Seidou](https://www.notion.so/Week-2-Workflow-Orchestration-17129780dc4a80148debf61e6453fffe?pvs=4)
|
||||
* Add your notes above this line
|
||||
|
||||
---
|
||||
|
||||
# Previous Cohorts
|
||||
|
||||
* 2022: [notes](../../2022/week_2_data_ingestion#community-notes) and [videos](../../2022/week_2_data_ingestion/)
|
||||
* 2023: [notes](../../2023/week_2_workflow_orchestration#community-notes) and [videos](../../2023/week_2_workflow_orchestration/)
|
||||
* 2024: [notes](../../2024/02-workflow-orchestration#community-notes) and [videos](../../2024/02-workflow-orchestration/)
|
||||
|
||||
62 02-workflow-orchestration/docker-compose.yml Normal file
@@ -0,0 +1,62 @@
|
||||
volumes:
|
||||
postgres-data:
|
||||
driver: local
|
||||
kestra-data:
|
||||
driver: local
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: postgres
|
||||
volumes:
|
||||
- postgres-data:/var/lib/postgresql/data
|
||||
environment:
|
||||
POSTGRES_DB: kestra
|
||||
POSTGRES_USER: kestra
|
||||
POSTGRES_PASSWORD: k3str4
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
|
||||
kestra:
|
||||
image: kestra/kestra:develop
|
||||
pull_policy: always
|
||||
user: "root"
|
||||
command: server standalone
|
||||
volumes:
|
||||
- kestra-data:/app/storage
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /tmp/kestra-wd:/tmp/kestra-wd
|
||||
environment:
|
||||
KESTRA_CONFIGURATION: |
|
||||
datasources:
|
||||
postgres:
|
||||
url: jdbc:postgresql://postgres:5432/kestra
|
||||
driverClassName: org.postgresql.Driver
|
||||
username: kestra
|
||||
password: k3str4
|
||||
kestra:
|
||||
server:
|
||||
basicAuth:
|
||||
enabled: false
|
||||
username: "admin@kestra.io" # it must be a valid email address
|
||||
password: kestra
|
||||
repository:
|
||||
type: postgres
|
||||
storage:
|
||||
type: local
|
||||
local:
|
||||
basePath: "/app/storage"
|
||||
queue:
|
||||
type: postgres
|
||||
tasks:
|
||||
tmpDir:
|
||||
path: /tmp/kestra-wd/tmp
|
||||
url: http://localhost:8080/
|
||||
ports:
|
||||
- "8080:8080"
|
||||
- "8081:8081"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_started
|
||||
55 02-workflow-orchestration/flows/01_getting_started_data_pipeline.yaml Normal file
@@ -0,0 +1,55 @@
|
||||
id: 01_getting_started_data_pipeline
|
||||
namespace: zoomcamp
|
||||
|
||||
inputs:
|
||||
- id: columns_to_keep
|
||||
type: ARRAY
|
||||
itemType: STRING
|
||||
defaults:
|
||||
- brand
|
||||
- price
|
||||
|
||||
tasks:
|
||||
- id: extract
|
||||
type: io.kestra.plugin.core.http.Download
|
||||
uri: https://dummyjson.com/products
|
||||
|
||||
- id: transform
|
||||
type: io.kestra.plugin.scripts.python.Script
|
||||
containerImage: python:3.11-alpine
|
||||
inputFiles:
|
||||
data.json: "{{outputs.extract.uri}}"
|
||||
outputFiles:
|
||||
- "*.json"
|
||||
env:
|
||||
COLUMNS_TO_KEEP: "{{inputs.columns_to_keep}}"
|
||||
script: |
|
||||
import json
|
||||
import os
|
||||
|
||||
columns_to_keep_str = os.getenv("COLUMNS_TO_KEEP")
|
||||
columns_to_keep = json.loads(columns_to_keep_str)
|
||||
|
||||
with open("data.json", "r") as file:
|
||||
data = json.load(file)
|
||||
|
||||
filtered_data = [
|
||||
{column: product.get(column, "N/A") for column in columns_to_keep}
|
||||
for product in data["products"]
|
||||
]
|
||||
|
||||
with open("products.json", "w") as file:
|
||||
json.dump(filtered_data, file, indent=4)
|
||||
|
||||
- id: query
|
||||
type: io.kestra.plugin.jdbc.duckdb.Query
|
||||
inputFiles:
|
||||
products.json: "{{outputs.transform.outputFiles['products.json']}}"
|
||||
sql: |
|
||||
INSTALL json;
|
||||
LOAD json;
|
||||
SELECT brand, round(avg(price), 2) as avg_price
|
||||
FROM read_json_auto('{{workingDir}}/products.json')
|
||||
GROUP BY brand
|
||||
ORDER BY avg_price DESC;
|
||||
fetchType: STORE
|
||||
270 02-workflow-orchestration/flows/02_postgres_taxi.yaml Normal file
@@ -0,0 +1,270 @@
|
||||
id: 02_postgres_taxi
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
The CSV Data used in the course: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: yellow
|
||||
|
||||
- id: year
|
||||
type: SELECT
|
||||
displayName: Select year
|
||||
values: ["2019", "2020"]
|
||||
defaults: "2019"
|
||||
|
||||
- id: month
|
||||
type: SELECT
|
||||
displayName: Select month
|
||||
values: ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
|
||||
defaults: "01"
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv"
|
||||
staging_table: "public.{{inputs.taxi}}_tripdata_staging"
|
||||
table: "public.{{inputs.taxi}}_tripdata"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ inputs.year ~ '-' ~ inputs.month ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: yellow_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: yellow_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge]
|
||||
|
||||
- id: yellow_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(tpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: yellow_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime,
|
||||
passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID,
|
||||
DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount,
|
||||
improvement_surcharge, total_amount, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime,
|
||||
S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID,
|
||||
S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount,
|
||||
S.improvement_surcharge, S.total_amount, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: green_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: green_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge]
|
||||
|
||||
- id: green_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(lpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: green_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime,
|
||||
store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count,
|
||||
trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee,
|
||||
improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime,
|
||||
S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count,
|
||||
S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee,
|
||||
S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: This will remove output files. If you'd like to explore Kestra outputs, disable it.
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.jdbc.postgresql
|
||||
values:
|
||||
url: jdbc:postgresql://host.docker.internal:5432/postgres-zoomcamp
|
||||
username: kestra
|
||||
password: k3str4
|
||||
275 02-workflow-orchestration/flows/02_postgres_taxi_scheduled.yaml Normal file
@@ -0,0 +1,275 @@
|
||||
id: 02_postgres_taxi_scheduled
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
Best to add a label `backfill:true` from the UI to track executions created via a backfill.
|
||||
CSV data used here comes from: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
concurrency:
|
||||
limit: 1
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: yellow
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{trigger.date | date('yyyy-MM')}}.csv"
|
||||
staging_table: "public.{{inputs.taxi}}_tripdata_staging"
|
||||
table: "public.{{inputs.taxi}}_tripdata"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ (trigger.date | date('yyyy-MM')) ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: yellow_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: yellow_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge]
|
||||
|
||||
- id: yellow_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(tpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: yellow_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime,
|
||||
passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID,
|
||||
DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount,
|
||||
improvement_surcharge, total_amount, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime,
|
||||
S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID,
|
||||
S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount,
|
||||
S.improvement_surcharge, S.total_amount, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: green_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: green_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge]
|
||||
|
||||
- id: green_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(lpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: green_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime,
|
||||
store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count,
|
||||
trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee,
|
||||
improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime,
|
||||
S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count,
|
||||
S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee,
|
||||
S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: To avoid cluttering your storage, we will remove the downloaded files
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.jdbc.postgresql
|
||||
values:
|
||||
url: jdbc:postgresql://host.docker.internal:5432/postgres-zoomcamp
|
||||
username: kestra
|
||||
password: k3str4
|
||||
|
||||
triggers:
|
||||
- id: green_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 9 1 * *"
|
||||
inputs:
|
||||
taxi: green
|
||||
|
||||
- id: yellow_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 10 1 * *"
|
||||
inputs:
|
||||
taxi: yellow
|
||||
59 02-workflow-orchestration/flows/03_postgres_dbt.yaml Normal file
@@ -0,0 +1,59 @@
|
||||
id: 03_postgres_dbt
|
||||
namespace: zoomcamp
|
||||
inputs:
|
||||
- id: dbt_command
|
||||
type: SELECT
|
||||
allowCustomValue: true
|
||||
defaults: dbt build
|
||||
values:
|
||||
- dbt build
|
||||
- dbt debug # use when running the first time to validate DB connection
|
||||
tasks:
|
||||
- id: sync
|
||||
type: io.kestra.plugin.git.SyncNamespaceFiles
|
||||
url: https://github.com/DataTalksClub/data-engineering-zoomcamp
|
||||
branch: main
|
||||
namespace: "{{ flow.namespace }}"
|
||||
gitDirectory: 04-analytics-engineering/taxi_rides_ny
|
||||
dryRun: false
|
||||
# disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled
|
||||
|
||||
- id: dbt-build
|
||||
type: io.kestra.plugin.dbt.cli.DbtCLI
|
||||
env:
|
||||
DBT_DATABASE: postgres-zoomcamp
|
||||
DBT_SCHEMA: public
|
||||
namespaceFiles:
|
||||
enabled: true
|
||||
containerImage: ghcr.io/kestra-io/dbt-postgres:latest
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.scripts.runner.docker.Docker
|
||||
commands:
|
||||
- dbt deps
|
||||
- "{{ inputs.dbt_command }}"
|
||||
storeManifest:
|
||||
key: manifest.json
|
||||
namespace: "{{ flow.namespace }}"
|
||||
profiles: |
|
||||
default:
|
||||
outputs:
|
||||
dev:
|
||||
type: postgres
|
||||
host: host.docker.internal
|
||||
user: kestra
|
||||
password: k3str4
|
||||
port: 5432
|
||||
dbname: postgres-zoomcamp
|
||||
schema: public
|
||||
threads: 8
|
||||
connect_timeout: 10
|
||||
priority: interactive
|
||||
target: dev
|
||||
description: |
|
||||
Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables.
|
||||
```yaml
|
||||
sources:
|
||||
- name: staging
|
||||
database: postgres-zoomcamp
|
||||
schema: public
|
||||
```
|
||||
37 02-workflow-orchestration/flows/04_gcp_kv.yaml Normal file
@@ -0,0 +1,37 @@
|
||||
id: 04_gcp_kv
|
||||
namespace: zoomcamp
|
||||
|
||||
tasks:
|
||||
- id: gcp_creds
|
||||
type: io.kestra.plugin.core.kv.Set
|
||||
key: GCP_CREDS
|
||||
kvType: JSON
|
||||
value: |
|
||||
{
|
||||
"type": "service_account",
|
||||
"project_id": "...",
|
||||
}
|
||||
|
||||
- id: gcp_project_id
|
||||
type: io.kestra.plugin.core.kv.Set
|
||||
key: GCP_PROJECT_ID
|
||||
kvType: STRING
|
||||
value: kestra-sandbox # TODO replace with your project id
|
||||
|
||||
- id: gcp_location
|
||||
type: io.kestra.plugin.core.kv.Set
|
||||
key: GCP_LOCATION
|
||||
kvType: STRING
|
||||
value: europe-west2
|
||||
|
||||
- id: gcp_bucket_name
|
||||
type: io.kestra.plugin.core.kv.Set
|
||||
key: GCP_BUCKET_NAME
|
||||
kvType: STRING
|
||||
value: your-name-kestra # TODO make sure it's globally unique!
|
||||
|
||||
- id: gcp_dataset
|
||||
type: io.kestra.plugin.core.kv.Set
|
||||
key: GCP_DATASET
|
||||
kvType: STRING
|
||||
value: zoomcamp
|
||||
22 02-workflow-orchestration/flows/05_gcp_setup.yaml Normal file
@@ -0,0 +1,22 @@
|
||||
id: 05_gcp_setup
|
||||
namespace: zoomcamp
|
||||
|
||||
tasks:
|
||||
- id: create_gcs_bucket
|
||||
type: io.kestra.plugin.gcp.gcs.CreateBucket
|
||||
ifExists: SKIP
|
||||
storageClass: REGIONAL
|
||||
name: "{{kv('GCP_BUCKET_NAME')}}" # make sure it's globally unique!
|
||||
|
||||
- id: create_bq_dataset
|
||||
type: io.kestra.plugin.gcp.bigquery.CreateDataset
|
||||
name: "{{kv('GCP_DATASET')}}"
|
||||
ifExists: SKIP
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.gcp
|
||||
values:
|
||||
serviceAccount: "{{kv('GCP_CREDS')}}"
|
||||
projectId: "{{kv('GCP_PROJECT_ID')}}"
|
||||
location: "{{kv('GCP_LOCATION')}}"
|
||||
bucket: "{{kv('GCP_BUCKET_NAME')}}"
|
||||
248 02-workflow-orchestration/flows/06_gcp_taxi.yaml Normal file
@@ -0,0 +1,248 @@
|
||||
id: 06_gcp_taxi
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
The CSV Data used in the course: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: green
|
||||
|
||||
- id: year
|
||||
type: SELECT
|
||||
displayName: Select year
|
||||
values: ["2019", "2020"]
|
||||
defaults: "2019"
|
||||
allowCustomValue: true # allows you to type 2021 from the UI for the homework 🤗
|
||||
|
||||
- id: month
|
||||
type: SELECT
|
||||
displayName: Select month
|
||||
values: ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
|
||||
defaults: "01"
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv"
|
||||
gcs_file: "gs://{{kv('GCP_BUCKET_NAME')}}/{{vars.file}}"
|
||||
table: "{{kv('GCP_DATASET')}}.{{inputs.taxi}}_tripdata_{{inputs.year}}_{{inputs.month}}"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ inputs.year ~ '-' ~ inputs.month ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: upload_to_gcs
|
||||
type: io.kestra.plugin.gcp.gcs.Upload
|
||||
from: "{{render(vars.data)}}"
|
||||
to: "{{render(vars.gcs_file)}}"
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: bq_yellow_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(tpep_pickup_datetime);
|
||||
|
||||
- id: bq_yellow_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_yellow_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(tpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_yellow_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime, S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID, S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.improvement_surcharge, S.total_amount, S.congestion_surcharge);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: bq_green_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(lpep_pickup_datetime);
|
||||
|
||||
- id: bq_green_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_green_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(lpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_green_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime, S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count, S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee, S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: If you'd like to explore Kestra outputs, disable this task.
|
||||
disabled: false
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.gcp
|
||||
values:
|
||||
serviceAccount: "{{kv('GCP_CREDS')}}"
|
||||
projectId: "{{kv('GCP_PROJECT_ID')}}"
|
||||
location: "{{kv('GCP_LOCATION')}}"
|
||||
bucket: "{{kv('GCP_BUCKET_NAME')}}"
|
||||
02-workflow-orchestration/flows/06_gcp_taxi_scheduled.yaml (new file, 249 lines)
@@ -0,0 +1,249 @@
id: 06_gcp_taxi_scheduled
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
Best to add a label `backfill:true` from the UI to track executions created via a backfill.
|
||||
CSV data used here comes from: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: green
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{trigger.date | date('yyyy-MM')}}.csv"
|
||||
gcs_file: "gs://{{kv('GCP_BUCKET_NAME')}}/{{vars.file}}"
|
||||
table: "{{kv('GCP_DATASET')}}.{{inputs.taxi}}_tripdata_{{trigger.date | date('yyyy_MM')}}"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ (trigger.date | date('yyyy-MM')) ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: upload_to_gcs
|
||||
type: io.kestra.plugin.gcp.gcs.Upload
|
||||
from: "{{render(vars.data)}}"
|
||||
to: "{{render(vars.gcs_file)}}"
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: bq_yellow_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(tpep_pickup_datetime);
|
||||
|
||||
- id: bq_yellow_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_yellow_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(tpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_yellow_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime, S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID, S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.improvement_surcharge, S.total_amount, S.congestion_surcharge);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: bq_green_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(lpep_pickup_datetime);
|
||||
|
||||
- id: bq_green_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_green_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(lpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_green_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime, S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count, S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee, S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: To avoid cluttering your storage, we will remove the downloaded files
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.gcp
|
||||
values:
|
||||
serviceAccount: "{{kv('GCP_CREDS')}}"
|
||||
projectId: "{{kv('GCP_PROJECT_ID')}}"
|
||||
location: "{{kv('GCP_LOCATION')}}"
|
||||
bucket: "{{kv('GCP_BUCKET_NAME')}}"
|
||||
|
||||
triggers:
|
||||
- id: green_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 9 1 * *"
|
||||
inputs:
|
||||
taxi: green
|
||||
|
||||
- id: yellow_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 10 1 * *"
|
||||
inputs:
|
||||
taxi: yellow
|
||||
02-workflow-orchestration/flows/07_gcp_dbt.yaml (new file, 62 lines)
@@ -0,0 +1,62 @@
id: 07_gcp_dbt
|
||||
namespace: zoomcamp
|
||||
inputs:
|
||||
- id: dbt_command
|
||||
type: SELECT
|
||||
allowCustomValue: true
|
||||
defaults: dbt build
|
||||
values:
|
||||
- dbt build
|
||||
- dbt debug # use when running the first time to validate DB connection
|
||||
|
||||
tasks:
|
||||
- id: sync
|
||||
type: io.kestra.plugin.git.SyncNamespaceFiles
|
||||
url: https://github.com/DataTalksClub/data-engineering-zoomcamp
|
||||
branch: main
|
||||
namespace: "{{flow.namespace}}"
|
||||
gitDirectory: 04-analytics-engineering/taxi_rides_ny
|
||||
dryRun: false
|
||||
# disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled
|
||||
|
||||
- id: dbt-build
|
||||
type: io.kestra.plugin.dbt.cli.DbtCLI
|
||||
env:
|
||||
DBT_DATABASE: "{{kv('GCP_PROJECT_ID')}}"
|
||||
DBT_SCHEMA: "{{kv('GCP_DATASET')}}"
|
||||
namespaceFiles:
|
||||
enabled: true
|
||||
containerImage: ghcr.io/kestra-io/dbt-bigquery:latest
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.scripts.runner.docker.Docker
|
||||
inputFiles:
|
||||
sa.json: "{{kv('GCP_CREDS')}}"
|
||||
commands:
|
||||
- dbt deps
|
||||
- "{{ inputs.dbt_command }}"
|
||||
storeManifest:
|
||||
key: manifest.json
|
||||
namespace: "{{ flow.namespace }}"
|
||||
profiles: |
|
||||
default:
|
||||
outputs:
|
||||
dev:
|
||||
type: bigquery
|
||||
dataset: "{{kv('GCP_DATASET')}}"
|
||||
project: "{{kv('GCP_PROJECT_ID')}}"
|
||||
location: "{{kv('GCP_LOCATION')}}"
|
||||
keyfile: sa.json
|
||||
method: service-account
|
||||
priority: interactive
|
||||
threads: 16
|
||||
timeout_seconds: 300
|
||||
fixed_retries: 1
|
||||
target: dev
|
||||
description: |
|
||||
Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables.
|
||||
```yaml
|
||||
sources:
|
||||
- name: staging
|
||||
database: kestra-sandbox
|
||||
schema: zoomcamp
|
||||
```
|
||||
02-workflow-orchestration/homework.md (new file, 57 lines)
@@ -0,0 +1,57 @@
## Module 2 Homework

### Assignment

So far in the course, we processed data for the years 2019 and 2020. Your task is to extend the existing flows to include data for the year 2021.



As a hint, Kestra makes that process really easy:
1. You can leverage the backfill functionality in the [scheduled flow](../flows/07_gcp_taxi_scheduled.yaml) to backfill the data for the year 2021. Just make sure to select the time period for which data exists, i.e. from `2021-01-01` to `2021-07-31`. Also, make sure to do the same for both `yellow` and `green` taxi data (select the right service in the `taxi` input).
2. Alternatively, run the flow manually for each of the seven months of 2021 for both `yellow` and `green` taxi data. Challenge for you: find out how to loop over the combinations of year-month and `taxi` type with a `ForEach` task that triggers the flow for each combination via a `Subflow` task (see the sketch after this list).

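For option 2, a minimal sketch of such a looping flow is shown below. This is only an assumption for illustration: it presumes a separate, non-scheduled ingestion flow in the `zoomcamp` namespace that accepts `taxi`, `year` and `month` inputs, and the flow id `06_gcp_taxi` is a placeholder. The taxi type is kept as an input here to keep the sketch short; a nested `ForEach` over `[yellow, green]` would cover both in one run.

```yaml
id: backfill_2021
namespace: zoomcamp

inputs:
  - id: taxi
    type: SELECT
    values: [yellow, green]
    defaults: green

tasks:
  - id: for_each_month
    type: io.kestra.plugin.core.flow.ForEach
    # one value per month of 2021 for which data exists
    values: ["01", "02", "03", "04", "05", "06", "07"]
    tasks:
      - id: run_ingestion
        type: io.kestra.plugin.core.flow.Subflow
        namespace: zoomcamp
        flowId: 06_gcp_taxi   # illustrative: the non-scheduled ingestion flow taking taxi/year/month inputs
        inputs:
          taxi: "{{ inputs.taxi }}"
          year: "2021"
          month: "{{ taskrun.value }}"
        wait: true
```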
### Quiz Questions

Complete the Quiz shown below. It’s a set of 6 multiple-choice questions to test your understanding of workflow orchestration, Kestra and ETL pipelines for data lakes and warehouses.

1) Within the execution for `Yellow` Taxi data for the year `2020` and month `12`: what is the uncompressed file size (i.e. the output file `yellow_tripdata_2020-12.csv` of the `extract` task)?
- 128.3 MB
- 134.5 MB
- 364.7 MB
- 692.6 MB

2) What is the value of the variable `file` when the inputs `taxi` is set to `green`, `year` is set to `2020`, and `month` is set to `04` during execution?
- `{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv`
- `green_tripdata_2020-04.csv`
- `green_tripdata_04_2020.csv`
- `green_tripdata_2020.csv`

3) How many rows are there for the `Yellow` Taxi data for the year 2020?
- 13,537,299
- 24,648,499
- 18,324,219
- 29,430,127

4) How many rows are there for the `Green` Taxi data for the year 2020?
- 5,327,301
- 936,199
- 1,734,051
- 1,342,034

5) Using dbt on the `Green` and `Yellow` Taxi data for the year 2020, how many rows are there in the `fact_trips` table?
- 198
- 165
- 151
- 203

6) How would you configure the timezone to New York in a Schedule trigger?
- Add a `timezone` property set to `EST` in the `Schedule` trigger configuration
- Add a `timezone` property set to `America/New_York` in the `Schedule` trigger configuration
- Add a `timezone` property set to `UTC-5` in the `Schedule` trigger configuration
- Add a `location` property set to `New_York` in the `Schedule` trigger configuration


## Submitting the solutions

* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw2
* Check the link above to see the due date
02-workflow-orchestration/images/homework.png (new binary file, 716 KiB; not shown)

02-workflow-orchestration/postgres/docker-compose.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
version: "3.8"
services:
  postgres:
    image: postgres
    container_name: postgres-db
    environment:
      POSTGRES_USER: kestra
      POSTGRES_PASSWORD: k3str4
      POSTGRES_DB: postgres-zoomcamp
    ports:
      - "5432:5432"
    volumes:
      - postgres-data:/var/lib/postgresql/data
volumes:
  postgres-data:
03-data-warehouse/README.md (new file, 80 lines)
@@ -0,0 +1,80 @@
# Data Warehouse and BigQuery
|
||||
|
||||
- [Slides](https://docs.google.com/presentation/d/1a3ZoBAXFk8-EhUsd7rAZd-5p_HpltkzSeujjRGB2TAI/edit?usp=sharing)
|
||||
- [Big Query basic SQL](big_query.sql)
|
||||
|
||||
# Videos
|
||||
|
||||
## Data Warehouse
|
||||
|
||||
- Data Warehouse and BigQuery
|
||||
|
||||
[](https://youtu.be/jrHljAoD6nM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=34)
|
||||
|
||||
## :movie_camera: Partitioning and clustering
|
||||
|
||||
- Partitioning and Clustering
|
||||
|
||||
[](https://youtu.be/-CqXf7vhhDs&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=35)
|
||||
|
||||
- Partitioning vs Clustering
|
||||
|
||||
[](https://youtu.be/-CqXf7vhhDs?si=p1sYQCAs8dAa7jIm&t=193&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=35)
|
||||
|
||||
## :movie_camera: Best practices
|
||||
|
||||
[](https://youtu.be/k81mLJVX08w&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=36)
|
||||
|
||||
## :movie_camera: Internals of BigQuery
|
||||
|
||||
[](https://youtu.be/eduHi1inM4s&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=37)
|
||||
|
||||
## Advanced topics
|
||||
|
||||
### :movie_camera: Machine Learning in Big Query
|
||||
|
||||
[](https://youtu.be/B-WtpB0PuG4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=34)
|
||||
|
||||
* [SQL for ML in BigQuery](big_query_ml.sql)
|
||||
|
||||
**Important links**
|
||||
|
||||
- [BigQuery ML Tutorials](https://cloud.google.com/bigquery-ml/docs/tutorials)
|
||||
- [BigQuery ML Reference Parameter](https://cloud.google.com/bigquery-ml/docs/analytics-reference-patterns)
|
||||
- [Hyper Parameter tuning](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-glm)
|
||||
- [Feature preprocessing](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-preprocess-overview)
|
||||
|
||||
### :movie_camera: Deploying Machine Learning model from BigQuery
|
||||
|
||||
[](https://youtu.be/BjARzEWaznU&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=39)
|
||||
|
||||
- [Steps to extract and deploy model with docker](extract_model.md)
|
||||
|
||||
|
||||
|
||||
# Homework
|
||||
|
||||
* [2024 Homework](../cohorts/2024/03-data-warehouse/homework.md)
|
||||
|
||||
|
||||
# Community notes
|
||||
|
||||
Did you take notes? You can share them here.
|
||||
|
||||
* [Notes by Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/3_data_warehouse.md)
|
||||
* [Isaac Kargar's blog post](https://kargarisaac.github.io/blog/data%20engineering/jupyter/2022/01/30/data-engineering-w3.html)
|
||||
* [Marcos Torregrosa's blog post](https://www.n4gash.com/2023/data-engineering-zoomcamp-semana-3/)
|
||||
* [Notes by Victor Padilha](https://github.com/padilha/de-zoomcamp/tree/master/week3)
|
||||
* [Notes from Xia He-Bleinagel](https://xiahe-bleinagel.com/2023/02/week-3-data-engineering-zoomcamp-notes-data-warehouse-and-bigquery/)
|
||||
* [Bigger picture summary on Data Lakes, Data Warehouses, and tooling](https://medium.com/@verazabeida/zoomcamp-week-4-b8bde661bf98), by Vera
|
||||
* [Notes by froukje](https://github.com/froukje/de-zoomcamp/blob/main/week_3_data_warehouse/notes/notes_week_03.md)
|
||||
* [Notes by Alain Boisvert](https://github.com/boisalai/de-zoomcamp-2023/blob/main/week3.md)
|
||||
* [Notes from Vincenzo Galante](https://binchentso.notion.site/Data-Talks-Club-Data-Engineering-Zoomcamp-8699af8e7ff94ec49e6f9bdec8eb69fd)
|
||||
* [2024 videos transcript week3](https://drive.google.com/drive/folders/1quIiwWO-tJCruqvtlqe_Olw8nvYSmmDJ?usp=sharing) by Maria Fisher
|
||||
* [Notes by Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/3a-data-warehouse/readme.md)
|
||||
* [Jonah Oliver's blog post](https://www.jonahboliver.com/blog/de-zc-w3)
|
||||
* [2024 - steps to send data from Mage to GCS + creating external table](https://drive.google.com/file/d/1GIi6xnS4070a8MUlIg-ozITt485_-ePB/view?usp=drive_link) by Maria Fisher
|
||||
* [2024 - mage dataloader script to load the parquet files from a remote URL and push it to Google bucket as parquet file](https://github.com/amohan601/dataengineering-zoomcamp2024/blob/main/week_3_data_warehouse/mage_scripts/green_taxi_2022_v2.py) by Anju Mohan
|
||||
* [2024 - steps to send data from Mage to GCS + creating external table](https://drive.google.com/file/d/1GIi6xnS4070a8MUlIg-ozITt485_-ePB/view?usp=drive_link) by Maria Fisher
|
||||
* [Notes by HongWei](https://github.com/hwchua0209/data-engineering-zoomcamp-submission/blob/main/03-data-warehouse/README.md)
|
||||
* Add your notes here (above this line)
|
||||
04-analytics-engineering/README.md (new file, 140 lines)
@@ -0,0 +1,140 @@
# Module 4: Analytics Engineering
|
||||
Goal: Transforming the data loaded in DWH into Analytical Views developing a [dbt project](taxi_rides_ny/README.md).
|
||||
|
||||
### Prerequisites
|
||||
By this stage of the course you should have already:
|
||||
|
||||
- A running warehouse (BigQuery or postgres)
|
||||
- A set of running pipelines ingesting the project dataset (week 3 completed)
|
||||
- The following datasets ingested from the course [Datasets list](https://github.com/DataTalksClub/nyc-tlc-data/):
|
||||
* Yellow taxi data - Years 2019 and 2020
|
||||
* Green taxi data - Years 2019 and 2020
|
||||
* fhv data - Year 2019.
|
||||
|
||||
> [!NOTE]
|
||||
> * We have two quick hacks to load that data faster: follow [this video](https://www.youtube.com/watch?v=Mork172sK_c&list=PLaNLNpjZpzwgneiI-Gl8df8GCsPYp_6Bs) for option 1, or check the instructions in [week3/extras](../03-data-warehouse/extras) for option 2
|
||||
|
||||
## Setting up your environment
|
||||
|
||||
> [!NOTE]
|
||||
> The *cloud* setup is the preferred option.
|
||||
>
|
||||
> The *local* setup does not require a cloud database.
|
||||
|
||||
| Alternative A | Alternative B |
|
||||
|---|---|
|
||||
| Setting up dbt for using BigQuery (cloud) | Setting up dbt for using Postgres locally |
|
||||
|- Open a free developer dbt cloud account following [this link](https://www.getdbt.com/signup/)|- Open a free developer dbt cloud account following [this link](https://www.getdbt.com/signup/)<br><br> |
|
||||
| - [Following these instructions to connect to your BigQuery instance](https://docs.getdbt.com/guides/bigquery?step=4) | - follow the [official dbt documentation](https://docs.getdbt.com/docs/core/installation-overview) or <br>- follow the [dbt core with BigQuery on Docker](docker_setup/README.md) guide to set up dbt locally on Docker or <br>- use a Docker image from the official [Install with Docker](https://docs.getdbt.com/docs/core/docker-install) guide. |
|
||||
|- More detailed instructions in [dbt_cloud_setup.md](dbt_cloud_setup.md) | - You will need to install the latest version with the BigQuery adapter (dbt-bigquery).|
|
||||
| | - You will need to install the latest version with the postgres adapter (dbt-postgres).|
|
||||
| | After local installation you will have to set up the connection to Postgres in `profiles.yml`; you can find the templates [here](https://docs.getdbt.com/docs/core/connect-data-platform/postgres-setup) and a minimal sketch right after this table. |
|
||||
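For the local Postgres alternative, the sketch below shows what a minimal `profiles.yml` might look like. This is only an assumption for illustration: the host, credentials, database and schema are placeholders, and the top-level profile name has to match the `profile:` entry in `dbt_project.yml` (this project uses `default`).

```yaml
# profiles.yml (illustrative values; adjust host, credentials and schema to your environment)
default:
  target: dev
  outputs:
    dev:
      type: postgres
      host: localhost
      port: 5432
      user: root
      password: root
      dbname: production
      schema: dbt_your_schema
      threads: 4
```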
|
||||
|
||||
## Content
|
||||
|
||||
### Introduction to analytics engineering
|
||||
|
||||
* What is analytics engineering?
|
||||
* ETL vs ELT
|
||||
* Data modeling concepts (fact and dim tables)
|
||||
|
||||
[](https://youtu.be/uF76d5EmdtU&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=40)
|
||||
|
||||
### What is dbt?
|
||||
|
||||
* Introduction to dbt
|
||||
|
||||
[](https://www.youtube.com/watch?v=gsKuETFJr54&list=PLaNLNpjZpzwgneiI-Gl8df8GCsPYp_6Bs&index=5)
|
||||
|
||||
## Starting a dbt project
|
||||
|
||||
| Alternative A | Alternative B |
|
||||
|-----------------------------|--------------------------------|
|
||||
| Using BigQuery + dbt cloud | Using Postgres + dbt core (locally) |
|
||||
| - Starting a new project with dbt init (dbt cloud and core)<br>- dbt cloud setup<br>- project.yml<br><br> | - Starting a new project with dbt init (dbt cloud and core)<br>- dbt core local setup<br>- profiles.yml<br>- project.yml |
|
||||
| [](https://www.youtube.com/watch?v=J0XCDyKiU64&list=PLaNLNpjZpzwgneiI-Gl8df8GCsPYp_6Bs&index=4) | [](https://youtu.be/1HmL63e-vRs&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=43) |
|
||||
|
||||
### dbt models
|
||||
|
||||
* Anatomy of a dbt model: written code vs compiled Sources
|
||||
* Materialisations: table, view, incremental, ephemeral
|
||||
* Seeds, sources and ref
|
||||
* Jinja and Macros
|
||||
* Packages
|
||||
* Variables
|
||||
|
||||
[](https://www.youtube.com/watch?v=ueVy2N54lyc&list=PLaNLNpjZpzwgneiI-Gl8df8GCsPYp_6Bs&index=3)
|
||||
|
||||
> [!NOTE]
|
||||
> *This video is shown entirely on dbt cloud IDE but the same steps can be followed locally on the IDE of your choice*
|
||||
|
||||
> [!TIP]
|
||||
>* If you receive an error stating "Permission denied while globbing file pattern." when attempting to run `fact_trips.sql`, this video may be helpful in resolving the issue
|
||||
>
|
||||
>[](https://youtu.be/kL3ZVNL9Y4A&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=34)
|
||||
|
||||
### Testing and documenting dbt models
|
||||
* Tests (see the `schema.yml` sketch below)
|
||||
* Documentation
|
||||
|
||||
[](https://www.youtube.com/watch?v=2dNJXHFCHaY&list=PLaNLNpjZpzwgneiI-Gl8df8GCsPYp_6Bs&index=2)
|
||||
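As a rough illustration of how tests and documentation go together, both are declared next to the models in a `schema.yml` file. The model and column names below are placeholders rather than the course's exact files:

```yaml
# models/staging/schema.yml (illustrative names)
version: 2

models:
  - name: stg_green_tripdata
    description: "Staging model for green taxi trips"
    columns:
      - name: tripid
        description: "Surrogate key built from vendorid and pickup datetime"
        tests:
          - unique:
              severity: warn
          - not_null:
              severity: warn
```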
|
||||
>[!NOTE]
|
||||
> *This video is shown entirely on dbt cloud IDE but the same steps can be followed locally on the IDE of your choice*
|
||||
|
||||
## Deployment
|
||||
|
||||
| Alternative A | Alternative B |
|
||||
|-----------------------------|--------------------------------|
|
||||
| Using BigQuery + dbt cloud | Using Postgres + dbt core (locally) |
|
||||
| - Deployment: development environment vs production<br>- dbt cloud: scheduler, sources and hosted documentation | - Deployment: development environment vs production<br>- dbt cloud: scheduler, sources and hosted documentation |
|
||||
| [](https://www.youtube.com/watch?v=V2m5C0n8Gro&list=PLaNLNpjZpzwgneiI-Gl8df8GCsPYp_6Bs&index=6) | [](https://youtu.be/Cs9Od1pcrzM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=47) |
|
||||
|
||||
## Visualising the transformed data
|
||||
|
||||
:movie_camera: Google Data Studio video (now renamed to Looker Studio)
|
||||
|
||||
[](https://youtu.be/39nLTs74A3E&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=48)
|
||||
|
||||
:movie_camera: Metabase Video
|
||||
|
||||
[](https://youtu.be/BnLkrA7a6gM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=49)
|
||||
|
||||
|
||||
## Advanced concepts
|
||||
|
||||
* [Make a model Incremental](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models)
|
||||
* [Use of tags](https://docs.getdbt.com/reference/resource-configs/tags)
|
||||
* [Hooks](https://docs.getdbt.com/docs/building-a-dbt-project/hooks-operations)
|
||||
* [Analysis](https://docs.getdbt.com/docs/building-a-dbt-project/analyses)
|
||||
* [Snapshots](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots)
|
||||
* [Exposure](https://docs.getdbt.com/docs/building-a-dbt-project/exposures)
|
||||
* [Metrics](https://docs.getdbt.com/docs/building-a-dbt-project/metrics)
|
||||
|
||||
|
||||
## Community notes
|
||||
|
||||
Did you take notes? You can share them here.
|
||||
|
||||
* [Notes by Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/4_analytics.md)
|
||||
* [Sandy's DE learning blog](https://learningdataengineering540969211.wordpress.com/2022/02/17/week-4-setting-up-dbt-cloud-with-bigquery/)
|
||||
* [Notes by Victor Padilha](https://github.com/padilha/de-zoomcamp/tree/master/week4)
|
||||
* [Marcos Torregrosa's blog (spanish)](https://www.n4gash.com/2023/data-engineering-zoomcamp-semana-4/)
|
||||
* [Notes by froukje](https://github.com/froukje/de-zoomcamp/blob/main/week_4_analytics_engineering/notes/notes_week_04.md)
|
||||
* [Notes by Alain Boisvert](https://github.com/boisalai/de-zoomcamp-2023/blob/main/week4.md)
|
||||
* [Setting up Prefect with dbt by Vera](https://medium.com/@verazabeida/zoomcamp-week-5-5b6a9d53a3a0)
|
||||
* [Blog by Xia He-Bleinagel](https://xiahe-bleinagel.com/2023/02/week-4-data-engineering-zoomcamp-notes-analytics-engineering-and-dbt/)
|
||||
* [Setting up DBT with BigQuery by Tofag](https://medium.com/@fagbuyit/setting-up-your-dbt-cloud-dej-9-d18e5b7c96ba)
|
||||
* [Blog post by Dewi Oktaviani](https://medium.com/@oktavianidewi/de-zoomcamp-2023-learning-week-4-analytics-engineering-with-dbt-53f781803d3e)
|
||||
* [Notes from Vincenzo Galante](https://binchentso.notion.site/Data-Talks-Club-Data-Engineering-Zoomcamp-8699af8e7ff94ec49e6f9bdec8eb69fd)
|
||||
* [Notes from Balaji](https://github.com/Balajirvp/DE-Zoomcamp/blob/main/Week%204/Data%20Engineering%20Zoomcamp%20Week%204.ipynb)
|
||||
* [Notes by Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/4-analytics-engineering/readme.md)
|
||||
* [2024 - Videos transcript week4](https://drive.google.com/drive/folders/1V2sHWOotPEMQTdMT4IMki1fbMPTn3jOP?usp=drive)
|
||||
* [Blog Post](https://www.jonahboliver.com/blog/de-zc-w4) by Jonah Oliver
|
||||
* Add your notes here (above this line)
|
||||
|
||||
## Useful links
|
||||
- [Slides used in the videos](https://docs.google.com/presentation/d/1xSll_jv0T8JF4rYZvLHfkJXYqUjPtThA/edit?usp=sharing&ouid=114544032874539580154&rtpof=true&sd=true)
|
||||
- [Visualizing data with Metabase course](https://www.metabase.com/learn/visualization/)
|
||||
- [dbt free courses](https://courses.getdbt.com/collections)
|
||||
04-analytics-engineering/taxi_rides_ny/.gitignore (new file, vendored, 5 lines)
@@ -0,0 +1,5 @@
# you shouldn't commit these into source control
|
||||
# these are the default directory names, adjust/add to fit your needs
|
||||
target/
|
||||
dbt_packages/
|
||||
logs/
|
||||
@@ -35,4 +35,4 @@ _Alternative: use `$ dbt build` to execute with one command the 3 steps above to
|
||||
- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
|
||||
- Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support
|
||||
- Find [dbt events](https://events.getdbt.com) near you
|
||||
- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
|
||||
- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
|
||||
@@ -0,0 +1,49 @@
|
||||
-- MAKE SURE YOU REPLACE taxi-rides-ny-339813-412521 WITH THE NAME OF YOUR DATASET!
|
||||
-- When you run the query, only run 5 of the ALTER TABLE statements at one time (by highlighting only 5).
|
||||
-- Otherwise BigQuery will say too many alterations to the table are being made.
|
||||
|
||||
CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` as
|
||||
SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2019`;
|
||||
|
||||
|
||||
CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` as
|
||||
SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2019`;
|
||||
|
||||
insert into `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2020` ;
|
||||
|
||||
|
||||
insert into `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2020`;
|
||||
|
||||
-- Fixes yellow table schema
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN vendor_id TO VendorID;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN pickup_datetime TO tpep_pickup_datetime;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN dropoff_datetime TO tpep_dropoff_datetime;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN rate_code TO RatecodeID;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN imp_surcharge TO improvement_surcharge;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN pickup_location_id TO PULocationID;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata`
|
||||
RENAME COLUMN dropoff_location_id TO DOLocationID;
|
||||
|
||||
-- Fixes green table schema
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN vendor_id TO VendorID;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN pickup_datetime TO lpep_pickup_datetime;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN dropoff_datetime TO lpep_dropoff_datetime;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN rate_code TO RatecodeID;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN imp_surcharge TO improvement_surcharge;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN pickup_location_id TO PULocationID;
|
||||
ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata`
|
||||
RENAME COLUMN dropoff_location_id TO DOLocationID;
|
||||
@@ -7,13 +7,13 @@ version: '1.0.0'
|
||||
config-version: 2
|
||||
|
||||
# This setting configures which "profile" dbt uses for this project.
|
||||
profile: 'pg-dbt-workshop'
|
||||
profile: 'default'
|
||||
|
||||
# These configurations specify where dbt should look for different types of files.
|
||||
# The `source-paths` config, for example, states that models in this project can be
|
||||
# The `model-paths` config, for example, states that models in this project can be
|
||||
# found in the "models/" directory. You probably won't need to change these!
|
||||
model-paths: ["models"]
|
||||
analysis-paths: ["analysis"]
|
||||
analysis-paths: ["analyses"]
|
||||
test-paths: ["tests"]
|
||||
seed-paths: ["seeds"]
|
||||
macro-paths: ["macros"]
|
||||
@@ -21,17 +21,20 @@ snapshot-paths: ["snapshots"]
|
||||
|
||||
target-path: "target" # directory which will store compiled SQL files
|
||||
clean-targets: # directories to be removed by `dbt clean`
|
||||
- "target"
|
||||
- "dbt_packages"
|
||||
- "dbt_modules"
|
||||
- "target"
|
||||
- "dbt_packages"
|
||||
|
||||
|
||||
# Configuring models
|
||||
# Full documentation: https://docs.getdbt.com/docs/configuring-models
|
||||
|
||||
# In this example config, we tell dbt to build all models in the example/ directory
|
||||
# as tables. These settings can be overridden in the individual model files
|
||||
# In dbt, the default materialization for a model is a view. This means, when you run
|
||||
# dbt run or dbt build, all of your models will be built as a view in your data platform.
|
||||
# The configuration below will override this setting for models in the example folder to
|
||||
# instead be materialized as tables. Any models you add to the root of the models folder will
|
||||
# continue to be built as views. These settings can be overridden in the individual model files
|
||||
# using the `{{ config(...) }}` macro.
|
||||
|
||||
models:
|
||||
taxi_rides_ny:
|
||||
# Applies to all files under models/.../
|
||||
@@ -46,4 +49,4 @@ seeds:
|
||||
taxi_rides_ny:
|
||||
taxi_zone_lookup:
|
||||
+column_types:
|
||||
locationid: numeric
|
||||
locationid: numeric
|
||||
@@ -1,18 +1,17 @@
|
||||
{#
|
||||
{#
|
||||
This macro returns the description of the payment_type
|
||||
#}
|
||||
|
||||
{% macro get_payment_type_description(payment_type) -%}
|
||||
|
||||
case {{ payment_type }}
|
||||
case {{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }}
|
||||
when 1 then 'Credit card'
|
||||
when 2 then 'Cash'
|
||||
when 3 then 'No charge'
|
||||
when 4 then 'Dispute'
|
||||
when 5 then 'Unknown'
|
||||
when 6 then 'Voided trip'
|
||||
else 'EMPTY'
|
||||
end
|
||||
|
||||
{%- endmacro %}
|
||||
|
||||
|
||||
{%- endmacro %}
|
||||
@@ -1,9 +1,8 @@
|
||||
{{ config(materialized='table') }}
|
||||
|
||||
|
||||
select
|
||||
locationid,
|
||||
borough,
|
||||
zone,
|
||||
replace(service_zone,'Boro','Green') as service_zone
|
||||
replace(service_zone,'Boro','Green') as service_zone
|
||||
from {{ ref('taxi_zone_lookup') }}
|
||||
@@ -6,8 +6,7 @@ with trips_data as (
|
||||
select
|
||||
-- Reveneue grouping
|
||||
pickup_zone as revenue_zone,
|
||||
date_trunc('month', pickup_datetime) as revenue_month,
|
||||
--Note: For BQ use instead: date_trunc(pickup_datetime, month) as revenue_month,
|
||||
{{ dbt.date_trunc("month", "pickup_datetime") }} as revenue_month,
|
||||
|
||||
service_type,
|
||||
|
||||
@@ -20,12 +19,11 @@ with trips_data as (
|
||||
sum(ehail_fee) as revenue_monthly_ehail_fee,
|
||||
sum(improvement_surcharge) as revenue_monthly_improvement_surcharge,
|
||||
sum(total_amount) as revenue_monthly_total_amount,
|
||||
sum(congestion_surcharge) as revenue_monthly_congestion_surcharge,
|
||||
|
||||
-- Additional calculations
|
||||
count(tripid) as total_monthly_trips,
|
||||
avg(passenger_count) as avg_montly_passenger_count,
|
||||
avg(trip_distance) as avg_montly_trip_distance
|
||||
avg(passenger_count) as avg_monthly_passenger_count,
|
||||
avg(trip_distance) as avg_monthly_trip_distance
|
||||
|
||||
from trips_data
|
||||
group by 1,2,3
|
||||
group by 1,2,3
|
||||
@@ -1,29 +1,29 @@
|
||||
{{ config(materialized='table') }}
|
||||
{{
|
||||
config(
|
||||
materialized='table'
|
||||
)
|
||||
}}
|
||||
|
||||
with green_data as (
|
||||
with green_tripdata as (
|
||||
select *,
|
||||
'Green' as service_type
|
||||
'Green' as service_type
|
||||
from {{ ref('stg_green_tripdata') }}
|
||||
),
|
||||
|
||||
yellow_data as (
|
||||
yellow_tripdata as (
|
||||
select *,
|
||||
'Yellow' as service_type
|
||||
from {{ ref('stg_yellow_tripdata') }}
|
||||
),
|
||||
|
||||
trips_unioned as (
|
||||
select * from green_data
|
||||
union all
|
||||
select * from yellow_data
|
||||
select * from green_tripdata
|
||||
union all
|
||||
select * from yellow_tripdata
|
||||
),
|
||||
|
||||
dim_zones as (
|
||||
select * from {{ ref('dim_zones') }}
|
||||
where borough != 'Unknown'
|
||||
)
|
||||
select
|
||||
trips_unioned.tripid,
|
||||
select trips_unioned.tripid,
|
||||
trips_unioned.vendorid,
|
||||
trips_unioned.service_type,
|
||||
trips_unioned.ratecodeid,
|
||||
@ -48,10 +48,9 @@ select
|
||||
trips_unioned.improvement_surcharge,
|
||||
trips_unioned.total_amount,
|
||||
trips_unioned.payment_type,
|
||||
trips_unioned.payment_type_description,
|
||||
trips_unioned.congestion_surcharge
|
||||
trips_unioned.payment_type_description
|
||||
from trips_unioned
|
||||
inner join dim_zones as pickup_zone
|
||||
on trips_unioned.pickup_locationid = pickup_zone.locationid
|
||||
inner join dim_zones as dropoff_zone
|
||||
on trips_unioned.dropoff_locationid = dropoff_zone.locationid
|
||||
on trips_unioned.dropoff_locationid = dropoff_zone.locationid
|
||||
04-analytics-engineering/taxi_rides_ny/models/core/schema.yml (new file, 129 lines)
@ -0,0 +1,129 @@
|
||||
version: 2
|
||||
|
||||
models:
|
||||
- name: dim_zones
|
||||
description: >
|
||||
List of unique zones identified by locationid.
|
||||
Includes the service zone they correspond to (Green or yellow).
|
||||
|
||||
- name: dm_monthly_zone_revenue
|
||||
description: >
|
||||
Aggregated table of all taxi trips corresponding to both service zones (Green and yellow) per pickup zone, month and service.
|
||||
The table contains monthly sums of the fare elements used to calculate the monthly revenue.
|
||||
The table contains also monthly indicators like number of trips, and average trip distance.
|
||||
columns:
|
||||
- name: revenue_monthly_total_amount
|
||||
description: Monthly sum of the total_amount of the fare charged for the trip per pickup zone, month and service.
|
||||
tests:
|
||||
- not_null:
|
||||
severity: error
|
||||
|
||||
- name: fact_trips
|
||||
description: >
|
||||
Taxi trips corresponding to both service zones (Green and yellow).
|
||||
The table contains records where both pickup and dropoff locations are valid and known zones.
|
||||
Each record corresponds to a trip uniquely identified by tripid.
|
||||
columns:
|
||||
- name: tripid
|
||||
data_type: string
|
||||
description: "unique identifier conformed by the combination of vendorid and pickyp time"
|
||||
|
||||
- name: vendorid
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: service_type
|
||||
data_type: string
|
||||
description: ""
|
||||
|
||||
- name: ratecodeid
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: pickup_locationid
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: pickup_borough
|
||||
data_type: string
|
||||
description: ""
|
||||
|
||||
- name: pickup_zone
|
||||
data_type: string
|
||||
description: ""
|
||||
|
||||
- name: dropoff_locationid
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: dropoff_borough
|
||||
data_type: string
|
||||
description: ""
|
||||
|
||||
- name: dropoff_zone
|
||||
data_type: string
|
||||
description: ""
|
||||
|
||||
- name: pickup_datetime
|
||||
data_type: timestamp
|
||||
description: ""
|
||||
|
||||
- name: dropoff_datetime
|
||||
data_type: timestamp
|
||||
description: ""
|
||||
|
||||
- name: store_and_fwd_flag
|
||||
data_type: string
|
||||
description: ""
|
||||
|
||||
- name: passenger_count
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: trip_distance
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: trip_type
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: fare_amount
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: extra
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: mta_tax
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: tip_amount
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: tolls_amount
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: ehail_fee
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: improvement_surcharge
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: total_amount
|
||||
data_type: numeric
|
||||
description: ""
|
||||
|
||||
- name: payment_type
|
||||
data_type: int64
|
||||
description: ""
|
||||
|
||||
- name: payment_type_description
|
||||
data_type: string
|
||||
description: ""
|
||||
@ -1,20 +1,13 @@
|
||||
|
||||
version: 2
|
||||
|
||||
sources:
|
||||
- name: staging
|
||||
#For bigquery:
|
||||
#database: taxi-rides-ny-339813
|
||||
|
||||
# For postgres:
|
||||
database: production
|
||||
|
||||
schema: trips_data_all
|
||||
|
||||
- name: staging
|
||||
database: "{{ env_var('DBT_DATABASE', 'taxi-rides-ny-339813-412521') }}"
|
||||
schema: "{{ env_var('DBT_SCHEMA', 'trips_data_all') }}"
|
||||
# loaded_at_field: record_loaded_at
|
||||
tables:
|
||||
- name: green_tripdata
|
||||
- name: yellow_tripdata
|
||||
tables:
|
||||
- name: green_tripdata
|
||||
- name: yellow_tripdata
|
||||
# freshness:
|
||||
# error_after: {count: 6, period: hour}
|
||||
|
||||
@ -75,7 +68,7 @@ models:
|
||||
memory before sending to the vendor, aka “store and forward,”
|
||||
because the vehicle did not have a connection to the server.
|
||||
Y= store and forward trip
|
||||
N= not a store and forward trip
|
||||
N = not a store and forward trip
|
||||
- name: Dropoff_longitude
|
||||
description: Longitude where the meter was disengaged.
|
||||
- name: Dropoff_latitude
|
||||
@ -200,4 +193,4 @@ models:
|
||||
- name: Tolls_amount
|
||||
description: Total amount of all tolls paid in trip.
|
||||
- name: Total_amount
|
||||
description: The total amount charged to passengers. Does not include cash tips.
|
||||
description: The total amount charged to passengers. Does not include cash tips.
|
||||
@ -0,0 +1,52 @@
|
||||
{{
|
||||
config(
|
||||
materialized='view'
|
||||
)
|
||||
}}
|
||||
|
||||
with tripdata as
|
||||
(
|
||||
select *,
|
||||
row_number() over(partition by vendorid, lpep_pickup_datetime) as rn
|
||||
from {{ source('staging','green_tripdata') }}
|
||||
where vendorid is not null
|
||||
)
|
||||
select
|
||||
-- identifiers
|
||||
{{ dbt_utils.generate_surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid,
|
||||
{{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid,
|
||||
{{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid,
|
||||
{{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid,
|
||||
{{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid,
|
||||
|
||||
-- timestamps
|
||||
cast(lpep_pickup_datetime as timestamp) as pickup_datetime,
|
||||
cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime,
|
||||
|
||||
-- trip info
|
||||
store_and_fwd_flag,
|
||||
{{ dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count,
|
||||
cast(trip_distance as numeric) as trip_distance,
|
||||
{{ dbt.safe_cast("trip_type", api.Column.translate_type("integer")) }} as trip_type,
|
||||
|
||||
-- payment info
|
||||
cast(fare_amount as numeric) as fare_amount,
|
||||
cast(extra as numeric) as extra,
|
||||
cast(mta_tax as numeric) as mta_tax,
|
||||
cast(tip_amount as numeric) as tip_amount,
|
||||
cast(tolls_amount as numeric) as tolls_amount,
|
||||
cast(ehail_fee as numeric) as ehail_fee,
|
||||
cast(improvement_surcharge as numeric) as improvement_surcharge,
|
||||
cast(total_amount as numeric) as total_amount,
|
||||
coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type,
|
||||
{{ get_payment_type_description("payment_type") }} as payment_type_description
|
||||
from tripdata
|
||||
where rn = 1
|
||||
|
||||
|
||||
-- dbt build --select <model_name> --vars '{'is_test_run': 'false'}'
|
||||
{% if var('is_test_run', default=true) %}
|
||||
|
||||
limit 100
|
||||
|
||||
{% endif %}
|
||||
@ -9,19 +9,19 @@ with tripdata as
|
||||
)
|
||||
select
|
||||
-- identifiers
|
||||
{{ dbt_utils.surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid,
|
||||
cast(vendorid as integer) as vendorid,
|
||||
cast(ratecodeid as integer) as ratecodeid,
|
||||
cast(pulocationid as integer) as pickup_locationid,
|
||||
cast(dolocationid as integer) as dropoff_locationid,
|
||||
|
||||
{{ dbt_utils.generate_surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid,
|
||||
{{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid,
|
||||
{{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid,
|
||||
{{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid,
|
||||
{{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid,
|
||||
|
||||
-- timestamps
|
||||
cast(tpep_pickup_datetime as timestamp) as pickup_datetime,
|
||||
cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime,
|
||||
|
||||
-- trip info
|
||||
store_and_fwd_flag,
|
||||
cast(passenger_count as integer) as passenger_count,
|
||||
{{ dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count,
|
||||
cast(trip_distance as numeric) as trip_distance,
|
||||
-- yellow cabs are always street-hail
|
||||
1 as trip_type,
|
||||
@ -35,16 +35,14 @@ select
|
||||
cast(0 as numeric) as ehail_fee,
|
||||
cast(improvement_surcharge as numeric) as improvement_surcharge,
|
||||
cast(total_amount as numeric) as total_amount,
|
||||
cast(payment_type as integer) as payment_type,
|
||||
{{ get_payment_type_description('payment_type') }} as payment_type_description,
|
||||
cast(congestion_surcharge as numeric) as congestion_surcharge
|
||||
coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type,
|
||||
{{ get_payment_type_description('payment_type') }} as payment_type_description
|
||||
from tripdata
|
||||
where rn = 1
|
||||
|
||||
-- dbt build --m <model.sql> --var 'is_test_run: false'
|
||||
-- dbt build --select <model.sql> --vars '{'is_test_run: false}'
|
||||
{% if var('is_test_run', default=true) %}
|
||||
|
||||
limit 100
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% endif %}
|
||||
04-analytics-engineering/taxi_rides_ny/package-lock.yml (new file, 6 lines)
@ -0,0 +1,6 @@
|
||||
packages:
|
||||
- package: dbt-labs/dbt_utils
|
||||
version: 1.1.1
|
||||
- package: dbt-labs/codegen
|
||||
version: 0.12.1
|
||||
sha1_hash: d974113b0f072cce35300077208f38581075ab40
|
||||
04-analytics-engineering/taxi_rides_ny/packages.yml (new file, 5 lines)
@ -0,0 +1,5 @@
|
||||
packages:
|
||||
- package: dbt-labs/dbt_utils
|
||||
version: 1.1.1
|
||||
- package: dbt-labs/codegen
|
||||
version: 0.12.1
|
||||
@ -6,5 +6,4 @@ seeds:
|
||||
Taxi Zones roughly based on NYC Department of City Planning's Neighborhood
|
||||
Tabulation Areas (NTAs) and are meant to approximate neighborhoods, so you can see which
|
||||
neighborhood a passenger was picked up in, and which neighborhood they were dropped off in.
|
||||
Includes associated service_zone (EWR, Boro Zone, Yellow Zone)
|
||||
|
||||
Includes associated service_zone (EWR, Boro Zone, Yellow Zone)
|
||||
@ -1,266 +1,266 @@
|
||||
"locationid","borough","zone","service_zone"
|
||||
1,"EWR","Newark Airport","EWR"
|
||||
2,"Queens","Jamaica Bay","Boro Zone"
|
||||
3,"Bronx","Allerton/Pelham Gardens","Boro Zone"
|
||||
4,"Manhattan","Alphabet City","Yellow Zone"
|
||||
5,"Staten Island","Arden Heights","Boro Zone"
|
||||
6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone"
|
||||
7,"Queens","Astoria","Boro Zone"
|
||||
8,"Queens","Astoria Park","Boro Zone"
|
||||
9,"Queens","Auburndale","Boro Zone"
|
||||
10,"Queens","Baisley Park","Boro Zone"
|
||||
11,"Brooklyn","Bath Beach","Boro Zone"
|
||||
12,"Manhattan","Battery Park","Yellow Zone"
|
||||
13,"Manhattan","Battery Park City","Yellow Zone"
|
||||
14,"Brooklyn","Bay Ridge","Boro Zone"
|
||||
15,"Queens","Bay Terrace/Fort Totten","Boro Zone"
|
||||
16,"Queens","Bayside","Boro Zone"
|
||||
17,"Brooklyn","Bedford","Boro Zone"
|
||||
18,"Bronx","Bedford Park","Boro Zone"
|
||||
19,"Queens","Bellerose","Boro Zone"
|
||||
20,"Bronx","Belmont","Boro Zone"
|
||||
21,"Brooklyn","Bensonhurst East","Boro Zone"
|
||||
22,"Brooklyn","Bensonhurst West","Boro Zone"
|
||||
23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone"
|
||||
24,"Manhattan","Bloomingdale","Yellow Zone"
|
||||
25,"Brooklyn","Boerum Hill","Boro Zone"
|
||||
26,"Brooklyn","Borough Park","Boro Zone"
|
||||
27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone"
|
||||
28,"Queens","Briarwood/Jamaica Hills","Boro Zone"
|
||||
29,"Brooklyn","Brighton Beach","Boro Zone"
|
||||
30,"Queens","Broad Channel","Boro Zone"
|
||||
31,"Bronx","Bronx Park","Boro Zone"
|
||||
32,"Bronx","Bronxdale","Boro Zone"
|
||||
33,"Brooklyn","Brooklyn Heights","Boro Zone"
|
||||
34,"Brooklyn","Brooklyn Navy Yard","Boro Zone"
|
||||
35,"Brooklyn","Brownsville","Boro Zone"
|
||||
36,"Brooklyn","Bushwick North","Boro Zone"
|
||||
37,"Brooklyn","Bushwick South","Boro Zone"
|
||||
38,"Queens","Cambria Heights","Boro Zone"
|
||||
39,"Brooklyn","Canarsie","Boro Zone"
|
||||
40,"Brooklyn","Carroll Gardens","Boro Zone"
|
||||
41,"Manhattan","Central Harlem","Boro Zone"
|
||||
42,"Manhattan","Central Harlem North","Boro Zone"
|
||||
43,"Manhattan","Central Park","Yellow Zone"
|
||||
44,"Staten Island","Charleston/Tottenville","Boro Zone"
|
||||
45,"Manhattan","Chinatown","Yellow Zone"
|
||||
46,"Bronx","City Island","Boro Zone"
|
||||
47,"Bronx","Claremont/Bathgate","Boro Zone"
|
||||
48,"Manhattan","Clinton East","Yellow Zone"
|
||||
49,"Brooklyn","Clinton Hill","Boro Zone"
|
||||
50,"Manhattan","Clinton West","Yellow Zone"
|
||||
51,"Bronx","Co-Op City","Boro Zone"
|
||||
52,"Brooklyn","Cobble Hill","Boro Zone"
|
||||
53,"Queens","College Point","Boro Zone"
|
||||
54,"Brooklyn","Columbia Street","Boro Zone"
|
||||
55,"Brooklyn","Coney Island","Boro Zone"
|
||||
56,"Queens","Corona","Boro Zone"
|
||||
57,"Queens","Corona","Boro Zone"
|
||||
58,"Bronx","Country Club","Boro Zone"
|
||||
59,"Bronx","Crotona Park","Boro Zone"
|
||||
60,"Bronx","Crotona Park East","Boro Zone"
|
||||
61,"Brooklyn","Crown Heights North","Boro Zone"
|
||||
62,"Brooklyn","Crown Heights South","Boro Zone"
|
||||
63,"Brooklyn","Cypress Hills","Boro Zone"
|
||||
64,"Queens","Douglaston","Boro Zone"
|
||||
65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone"
|
||||
66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone"
|
||||
67,"Brooklyn","Dyker Heights","Boro Zone"
|
||||
68,"Manhattan","East Chelsea","Yellow Zone"
|
||||
69,"Bronx","East Concourse/Concourse Village","Boro Zone"
|
||||
70,"Queens","East Elmhurst","Boro Zone"
|
||||
71,"Brooklyn","East Flatbush/Farragut","Boro Zone"
|
||||
72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone"
|
||||
73,"Queens","East Flushing","Boro Zone"
|
||||
74,"Manhattan","East Harlem North","Boro Zone"
|
||||
75,"Manhattan","East Harlem South","Boro Zone"
|
||||
76,"Brooklyn","East New York","Boro Zone"
|
||||
77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone"
|
||||
78,"Bronx","East Tremont","Boro Zone"
|
||||
79,"Manhattan","East Village","Yellow Zone"
|
||||
80,"Brooklyn","East Williamsburg","Boro Zone"
|
||||
81,"Bronx","Eastchester","Boro Zone"
|
||||
82,"Queens","Elmhurst","Boro Zone"
|
||||
83,"Queens","Elmhurst/Maspeth","Boro Zone"
|
||||
84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone"
|
||||
85,"Brooklyn","Erasmus","Boro Zone"
|
||||
86,"Queens","Far Rockaway","Boro Zone"
|
||||
87,"Manhattan","Financial District North","Yellow Zone"
|
||||
88,"Manhattan","Financial District South","Yellow Zone"
|
||||
89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone"
|
||||
90,"Manhattan","Flatiron","Yellow Zone"
|
||||
91,"Brooklyn","Flatlands","Boro Zone"
|
||||
92,"Queens","Flushing","Boro Zone"
|
||||
93,"Queens","Flushing Meadows-Corona Park","Boro Zone"
|
||||
94,"Bronx","Fordham South","Boro Zone"
|
||||
95,"Queens","Forest Hills","Boro Zone"
|
||||
96,"Queens","Forest Park/Highland Park","Boro Zone"
|
||||
97,"Brooklyn","Fort Greene","Boro Zone"
|
||||
98,"Queens","Fresh Meadows","Boro Zone"
|
||||
99,"Staten Island","Freshkills Park","Boro Zone"
|
||||
100,"Manhattan","Garment District","Yellow Zone"
|
||||
101,"Queens","Glen Oaks","Boro Zone"
|
||||
102,"Queens","Glendale","Boro Zone"
|
||||
103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
|
||||
104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
|
||||
105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
|
||||
106,"Brooklyn","Gowanus","Boro Zone"
|
||||
107,"Manhattan","Gramercy","Yellow Zone"
|
||||
108,"Brooklyn","Gravesend","Boro Zone"
|
||||
109,"Staten Island","Great Kills","Boro Zone"
|
||||
110,"Staten Island","Great Kills Park","Boro Zone"
|
||||
111,"Brooklyn","Green-Wood Cemetery","Boro Zone"
|
||||
112,"Brooklyn","Greenpoint","Boro Zone"
|
||||
113,"Manhattan","Greenwich Village North","Yellow Zone"
|
||||
114,"Manhattan","Greenwich Village South","Yellow Zone"
|
||||
115,"Staten Island","Grymes Hill/Clifton","Boro Zone"
|
||||
116,"Manhattan","Hamilton Heights","Boro Zone"
|
||||
117,"Queens","Hammels/Arverne","Boro Zone"
|
||||
118,"Staten Island","Heartland Village/Todt Hill","Boro Zone"
|
||||
119,"Bronx","Highbridge","Boro Zone"
|
||||
120,"Manhattan","Highbridge Park","Boro Zone"
|
||||
121,"Queens","Hillcrest/Pomonok","Boro Zone"
|
||||
122,"Queens","Hollis","Boro Zone"
|
||||
123,"Brooklyn","Homecrest","Boro Zone"
|
||||
124,"Queens","Howard Beach","Boro Zone"
|
||||
125,"Manhattan","Hudson Sq","Yellow Zone"
|
||||
126,"Bronx","Hunts Point","Boro Zone"
|
||||
127,"Manhattan","Inwood","Boro Zone"
|
||||
128,"Manhattan","Inwood Hill Park","Boro Zone"
|
||||
129,"Queens","Jackson Heights","Boro Zone"
|
||||
130,"Queens","Jamaica","Boro Zone"
|
||||
131,"Queens","Jamaica Estates","Boro Zone"
|
||||
132,"Queens","JFK Airport","Airports"
|
||||
133,"Brooklyn","Kensington","Boro Zone"
|
||||
134,"Queens","Kew Gardens","Boro Zone"
|
||||
135,"Queens","Kew Gardens Hills","Boro Zone"
|
||||
136,"Bronx","Kingsbridge Heights","Boro Zone"
|
||||
137,"Manhattan","Kips Bay","Yellow Zone"
|
||||
138,"Queens","LaGuardia Airport","Airports"
|
||||
139,"Queens","Laurelton","Boro Zone"
|
||||
140,"Manhattan","Lenox Hill East","Yellow Zone"
|
||||
141,"Manhattan","Lenox Hill West","Yellow Zone"
|
||||
142,"Manhattan","Lincoln Square East","Yellow Zone"
|
||||
143,"Manhattan","Lincoln Square West","Yellow Zone"
|
||||
144,"Manhattan","Little Italy/NoLiTa","Yellow Zone"
|
||||
145,"Queens","Long Island City/Hunters Point","Boro Zone"
|
||||
146,"Queens","Long Island City/Queens Plaza","Boro Zone"
|
||||
147,"Bronx","Longwood","Boro Zone"
|
||||
148,"Manhattan","Lower East Side","Yellow Zone"
|
||||
149,"Brooklyn","Madison","Boro Zone"
|
||||
150,"Brooklyn","Manhattan Beach","Boro Zone"
|
||||
151,"Manhattan","Manhattan Valley","Yellow Zone"
|
||||
152,"Manhattan","Manhattanville","Boro Zone"
|
||||
153,"Manhattan","Marble Hill","Boro Zone"
|
||||
154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone"
|
||||
155,"Brooklyn","Marine Park/Mill Basin","Boro Zone"
|
||||
156,"Staten Island","Mariners Harbor","Boro Zone"
|
||||
157,"Queens","Maspeth","Boro Zone"
|
||||
158,"Manhattan","Meatpacking/West Village West","Yellow Zone"
|
||||
159,"Bronx","Melrose South","Boro Zone"
|
||||
160,"Queens","Middle Village","Boro Zone"
|
||||
161,"Manhattan","Midtown Center","Yellow Zone"
|
||||
162,"Manhattan","Midtown East","Yellow Zone"
|
||||
163,"Manhattan","Midtown North","Yellow Zone"
|
||||
164,"Manhattan","Midtown South","Yellow Zone"
|
||||
165,"Brooklyn","Midwood","Boro Zone"
|
||||
166,"Manhattan","Morningside Heights","Boro Zone"
|
||||
167,"Bronx","Morrisania/Melrose","Boro Zone"
|
||||
168,"Bronx","Mott Haven/Port Morris","Boro Zone"
|
||||
169,"Bronx","Mount Hope","Boro Zone"
|
||||
170,"Manhattan","Murray Hill","Yellow Zone"
|
||||
171,"Queens","Murray Hill-Queens","Boro Zone"
|
||||
172,"Staten Island","New Dorp/Midland Beach","Boro Zone"
|
||||
173,"Queens","North Corona","Boro Zone"
|
||||
174,"Bronx","Norwood","Boro Zone"
|
||||
175,"Queens","Oakland Gardens","Boro Zone"
|
||||
176,"Staten Island","Oakwood","Boro Zone"
|
||||
177,"Brooklyn","Ocean Hill","Boro Zone"
|
||||
178,"Brooklyn","Ocean Parkway South","Boro Zone"
|
||||
179,"Queens","Old Astoria","Boro Zone"
|
||||
180,"Queens","Ozone Park","Boro Zone"
|
||||
181,"Brooklyn","Park Slope","Boro Zone"
|
||||
182,"Bronx","Parkchester","Boro Zone"
|
||||
183,"Bronx","Pelham Bay","Boro Zone"
|
||||
184,"Bronx","Pelham Bay Park","Boro Zone"
|
||||
185,"Bronx","Pelham Parkway","Boro Zone"
|
||||
186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone"
|
||||
187,"Staten Island","Port Richmond","Boro Zone"
|
||||
188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone"
|
||||
189,"Brooklyn","Prospect Heights","Boro Zone"
|
||||
190,"Brooklyn","Prospect Park","Boro Zone"
|
||||
191,"Queens","Queens Village","Boro Zone"
|
||||
192,"Queens","Queensboro Hill","Boro Zone"
|
||||
193,"Queens","Queensbridge/Ravenswood","Boro Zone"
|
||||
194,"Manhattan","Randalls Island","Yellow Zone"
|
||||
195,"Brooklyn","Red Hook","Boro Zone"
|
||||
196,"Queens","Rego Park","Boro Zone"
|
||||
197,"Queens","Richmond Hill","Boro Zone"
|
||||
198,"Queens","Ridgewood","Boro Zone"
|
||||
199,"Bronx","Rikers Island","Boro Zone"
|
||||
200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone"
|
||||
201,"Queens","Rockaway Park","Boro Zone"
|
||||
202,"Manhattan","Roosevelt Island","Boro Zone"
|
||||
203,"Queens","Rosedale","Boro Zone"
|
||||
204,"Staten Island","Rossville/Woodrow","Boro Zone"
|
||||
205,"Queens","Saint Albans","Boro Zone"
|
||||
206,"Staten Island","Saint George/New Brighton","Boro Zone"
|
||||
207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone"
|
||||
208,"Bronx","Schuylerville/Edgewater Park","Boro Zone"
|
||||
209,"Manhattan","Seaport","Yellow Zone"
|
||||
210,"Brooklyn","Sheepshead Bay","Boro Zone"
|
||||
211,"Manhattan","SoHo","Yellow Zone"
|
||||
212,"Bronx","Soundview/Bruckner","Boro Zone"
|
||||
213,"Bronx","Soundview/Castle Hill","Boro Zone"
|
||||
214,"Staten Island","South Beach/Dongan Hills","Boro Zone"
|
||||
215,"Queens","South Jamaica","Boro Zone"
|
||||
216,"Queens","South Ozone Park","Boro Zone"
|
||||
217,"Brooklyn","South Williamsburg","Boro Zone"
|
||||
218,"Queens","Springfield Gardens North","Boro Zone"
|
||||
219,"Queens","Springfield Gardens South","Boro Zone"
|
||||
220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone"
|
||||
221,"Staten Island","Stapleton","Boro Zone"
|
||||
222,"Brooklyn","Starrett City","Boro Zone"
|
||||
223,"Queens","Steinway","Boro Zone"
|
||||
224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone"
|
||||
225,"Brooklyn","Stuyvesant Heights","Boro Zone"
|
||||
226,"Queens","Sunnyside","Boro Zone"
|
||||
227,"Brooklyn","Sunset Park East","Boro Zone"
|
||||
228,"Brooklyn","Sunset Park West","Boro Zone"
|
||||
229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone"
|
||||
230,"Manhattan","Times Sq/Theatre District","Yellow Zone"
|
||||
231,"Manhattan","TriBeCa/Civic Center","Yellow Zone"
|
||||
232,"Manhattan","Two Bridges/Seward Park","Yellow Zone"
|
||||
233,"Manhattan","UN/Turtle Bay South","Yellow Zone"
|
||||
234,"Manhattan","Union Sq","Yellow Zone"
|
||||
235,"Bronx","University Heights/Morris Heights","Boro Zone"
|
||||
236,"Manhattan","Upper East Side North","Yellow Zone"
|
||||
237,"Manhattan","Upper East Side South","Yellow Zone"
|
||||
238,"Manhattan","Upper West Side North","Yellow Zone"
|
||||
239,"Manhattan","Upper West Side South","Yellow Zone"
|
||||
240,"Bronx","Van Cortlandt Park","Boro Zone"
|
||||
241,"Bronx","Van Cortlandt Village","Boro Zone"
|
||||
242,"Bronx","Van Nest/Morris Park","Boro Zone"
|
||||
243,"Manhattan","Washington Heights North","Boro Zone"
|
||||
244,"Manhattan","Washington Heights South","Boro Zone"
|
||||
245,"Staten Island","West Brighton","Boro Zone"
|
||||
246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone"
|
||||
247,"Bronx","West Concourse","Boro Zone"
|
||||
248,"Bronx","West Farms/Bronx River","Boro Zone"
|
||||
249,"Manhattan","West Village","Yellow Zone"
|
||||
250,"Bronx","Westchester Village/Unionport","Boro Zone"
|
||||
251,"Staten Island","Westerleigh","Boro Zone"
|
||||
252,"Queens","Whitestone","Boro Zone"
|
||||
253,"Queens","Willets Point","Boro Zone"
|
||||
254,"Bronx","Williamsbridge/Olinville","Boro Zone"
|
||||
255,"Brooklyn","Williamsburg (North Side)","Boro Zone"
|
||||
256,"Brooklyn","Williamsburg (South Side)","Boro Zone"
|
||||
257,"Brooklyn","Windsor Terrace","Boro Zone"
|
||||
258,"Queens","Woodhaven","Boro Zone"
|
||||
259,"Bronx","Woodlawn/Wakefield","Boro Zone"
|
||||
260,"Queens","Woodside","Boro Zone"
|
||||
261,"Manhattan","World Trade Center","Yellow Zone"
|
||||
262,"Manhattan","Yorkville East","Yellow Zone"
|
||||
263,"Manhattan","Yorkville West","Yellow Zone"
|
||||
264,"Unknown","NV","N/A"
|
||||
265,"Unknown","NA","N/A"
|
||||
"locationid","borough","zone","service_zone"
|
||||
1,"EWR","Newark Airport","EWR"
|
||||
2,"Queens","Jamaica Bay","Boro Zone"
|
||||
3,"Bronx","Allerton/Pelham Gardens","Boro Zone"
|
||||
4,"Manhattan","Alphabet City","Yellow Zone"
|
||||
5,"Staten Island","Arden Heights","Boro Zone"
|
||||
6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone"
|
||||
7,"Queens","Astoria","Boro Zone"
|
||||
8,"Queens","Astoria Park","Boro Zone"
|
||||
9,"Queens","Auburndale","Boro Zone"
|
||||
10,"Queens","Baisley Park","Boro Zone"
|
||||
11,"Brooklyn","Bath Beach","Boro Zone"
|
||||
12,"Manhattan","Battery Park","Yellow Zone"
|
||||
13,"Manhattan","Battery Park City","Yellow Zone"
|
||||
14,"Brooklyn","Bay Ridge","Boro Zone"
|
||||
15,"Queens","Bay Terrace/Fort Totten","Boro Zone"
|
||||
16,"Queens","Bayside","Boro Zone"
|
||||
17,"Brooklyn","Bedford","Boro Zone"
|
||||
18,"Bronx","Bedford Park","Boro Zone"
|
||||
19,"Queens","Bellerose","Boro Zone"
|
||||
20,"Bronx","Belmont","Boro Zone"
|
||||
21,"Brooklyn","Bensonhurst East","Boro Zone"
|
||||
22,"Brooklyn","Bensonhurst West","Boro Zone"
|
||||
23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone"
|
||||
24,"Manhattan","Bloomingdale","Yellow Zone"
|
||||
25,"Brooklyn","Boerum Hill","Boro Zone"
|
||||
26,"Brooklyn","Borough Park","Boro Zone"
|
||||
27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone"
|
||||
28,"Queens","Briarwood/Jamaica Hills","Boro Zone"
|
||||
29,"Brooklyn","Brighton Beach","Boro Zone"
|
||||
30,"Queens","Broad Channel","Boro Zone"
|
||||
31,"Bronx","Bronx Park","Boro Zone"
|
||||
32,"Bronx","Bronxdale","Boro Zone"
|
||||
33,"Brooklyn","Brooklyn Heights","Boro Zone"
|
||||
34,"Brooklyn","Brooklyn Navy Yard","Boro Zone"
|
||||
35,"Brooklyn","Brownsville","Boro Zone"
|
||||
36,"Brooklyn","Bushwick North","Boro Zone"
|
||||
37,"Brooklyn","Bushwick South","Boro Zone"
|
||||
38,"Queens","Cambria Heights","Boro Zone"
|
||||
39,"Brooklyn","Canarsie","Boro Zone"
|
||||
40,"Brooklyn","Carroll Gardens","Boro Zone"
|
||||
41,"Manhattan","Central Harlem","Boro Zone"
|
||||
42,"Manhattan","Central Harlem North","Boro Zone"
|
||||
43,"Manhattan","Central Park","Yellow Zone"
|
||||
44,"Staten Island","Charleston/Tottenville","Boro Zone"
|
||||
45,"Manhattan","Chinatown","Yellow Zone"
|
||||
46,"Bronx","City Island","Boro Zone"
|
||||
47,"Bronx","Claremont/Bathgate","Boro Zone"
|
||||
48,"Manhattan","Clinton East","Yellow Zone"
|
||||
49,"Brooklyn","Clinton Hill","Boro Zone"
|
||||
50,"Manhattan","Clinton West","Yellow Zone"
|
||||
51,"Bronx","Co-Op City","Boro Zone"
|
||||
52,"Brooklyn","Cobble Hill","Boro Zone"
|
||||
53,"Queens","College Point","Boro Zone"
|
||||
54,"Brooklyn","Columbia Street","Boro Zone"
|
||||
55,"Brooklyn","Coney Island","Boro Zone"
|
||||
56,"Queens","Corona","Boro Zone"
|
||||
57,"Queens","Corona","Boro Zone"
|
||||
58,"Bronx","Country Club","Boro Zone"
|
||||
59,"Bronx","Crotona Park","Boro Zone"
|
||||
60,"Bronx","Crotona Park East","Boro Zone"
|
||||
61,"Brooklyn","Crown Heights North","Boro Zone"
|
||||
62,"Brooklyn","Crown Heights South","Boro Zone"
|
||||
63,"Brooklyn","Cypress Hills","Boro Zone"
|
||||
64,"Queens","Douglaston","Boro Zone"
|
||||
65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone"
|
||||
66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone"
|
||||
67,"Brooklyn","Dyker Heights","Boro Zone"
|
||||
68,"Manhattan","East Chelsea","Yellow Zone"
|
||||
69,"Bronx","East Concourse/Concourse Village","Boro Zone"
|
||||
70,"Queens","East Elmhurst","Boro Zone"
|
||||
71,"Brooklyn","East Flatbush/Farragut","Boro Zone"
|
||||
72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone"
|
||||
73,"Queens","East Flushing","Boro Zone"
|
||||
74,"Manhattan","East Harlem North","Boro Zone"
|
||||
75,"Manhattan","East Harlem South","Boro Zone"
|
||||
76,"Brooklyn","East New York","Boro Zone"
|
||||
77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone"
|
||||
78,"Bronx","East Tremont","Boro Zone"
|
||||
79,"Manhattan","East Village","Yellow Zone"
|
||||
80,"Brooklyn","East Williamsburg","Boro Zone"
|
||||
81,"Bronx","Eastchester","Boro Zone"
|
||||
82,"Queens","Elmhurst","Boro Zone"
|
||||
83,"Queens","Elmhurst/Maspeth","Boro Zone"
|
||||
84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone"
|
||||
85,"Brooklyn","Erasmus","Boro Zone"
|
||||
86,"Queens","Far Rockaway","Boro Zone"
|
||||
87,"Manhattan","Financial District North","Yellow Zone"
|
||||
88,"Manhattan","Financial District South","Yellow Zone"
|
||||
89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone"
|
||||
90,"Manhattan","Flatiron","Yellow Zone"
|
||||
91,"Brooklyn","Flatlands","Boro Zone"
|
||||
92,"Queens","Flushing","Boro Zone"
|
||||
93,"Queens","Flushing Meadows-Corona Park","Boro Zone"
|
||||
94,"Bronx","Fordham South","Boro Zone"
|
||||
95,"Queens","Forest Hills","Boro Zone"
|
||||
96,"Queens","Forest Park/Highland Park","Boro Zone"
|
||||
97,"Brooklyn","Fort Greene","Boro Zone"
|
||||
98,"Queens","Fresh Meadows","Boro Zone"
|
||||
99,"Staten Island","Freshkills Park","Boro Zone"
|
||||
100,"Manhattan","Garment District","Yellow Zone"
|
||||
101,"Queens","Glen Oaks","Boro Zone"
|
||||
102,"Queens","Glendale","Boro Zone"
|
||||
103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
|
||||
104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
|
||||
105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
|
||||
106,"Brooklyn","Gowanus","Boro Zone"
|
||||
107,"Manhattan","Gramercy","Yellow Zone"
|
||||
108,"Brooklyn","Gravesend","Boro Zone"
|
||||
109,"Staten Island","Great Kills","Boro Zone"
|
||||
110,"Staten Island","Great Kills Park","Boro Zone"
|
||||
111,"Brooklyn","Green-Wood Cemetery","Boro Zone"
|
||||
112,"Brooklyn","Greenpoint","Boro Zone"
|
||||
113,"Manhattan","Greenwich Village North","Yellow Zone"
|
||||
114,"Manhattan","Greenwich Village South","Yellow Zone"
|
||||
115,"Staten Island","Grymes Hill/Clifton","Boro Zone"
|
||||
116,"Manhattan","Hamilton Heights","Boro Zone"
|
||||
117,"Queens","Hammels/Arverne","Boro Zone"
|
||||
118,"Staten Island","Heartland Village/Todt Hill","Boro Zone"
|
||||
119,"Bronx","Highbridge","Boro Zone"
|
||||
120,"Manhattan","Highbridge Park","Boro Zone"
|
||||
121,"Queens","Hillcrest/Pomonok","Boro Zone"
|
||||
122,"Queens","Hollis","Boro Zone"
|
||||
123,"Brooklyn","Homecrest","Boro Zone"
|
||||
124,"Queens","Howard Beach","Boro Zone"
|
||||
125,"Manhattan","Hudson Sq","Yellow Zone"
|
||||
126,"Bronx","Hunts Point","Boro Zone"
|
||||
127,"Manhattan","Inwood","Boro Zone"
|
||||
128,"Manhattan","Inwood Hill Park","Boro Zone"
|
||||
129,"Queens","Jackson Heights","Boro Zone"
|
||||
130,"Queens","Jamaica","Boro Zone"
|
||||
131,"Queens","Jamaica Estates","Boro Zone"
|
||||
132,"Queens","JFK Airport","Airports"
|
||||
133,"Brooklyn","Kensington","Boro Zone"
|
||||
134,"Queens","Kew Gardens","Boro Zone"
|
||||
135,"Queens","Kew Gardens Hills","Boro Zone"
|
||||
136,"Bronx","Kingsbridge Heights","Boro Zone"
|
||||
137,"Manhattan","Kips Bay","Yellow Zone"
|
||||
138,"Queens","LaGuardia Airport","Airports"
|
||||
139,"Queens","Laurelton","Boro Zone"
|
||||
140,"Manhattan","Lenox Hill East","Yellow Zone"
|
||||
141,"Manhattan","Lenox Hill West","Yellow Zone"
|
||||
142,"Manhattan","Lincoln Square East","Yellow Zone"
|
||||
143,"Manhattan","Lincoln Square West","Yellow Zone"
|
||||
144,"Manhattan","Little Italy/NoLiTa","Yellow Zone"
|
||||
145,"Queens","Long Island City/Hunters Point","Boro Zone"
|
||||
146,"Queens","Long Island City/Queens Plaza","Boro Zone"
|
||||
147,"Bronx","Longwood","Boro Zone"
|
||||
148,"Manhattan","Lower East Side","Yellow Zone"
|
||||
149,"Brooklyn","Madison","Boro Zone"
|
||||
150,"Brooklyn","Manhattan Beach","Boro Zone"
|
||||
151,"Manhattan","Manhattan Valley","Yellow Zone"
|
||||
152,"Manhattan","Manhattanville","Boro Zone"
|
||||
153,"Manhattan","Marble Hill","Boro Zone"
|
||||
154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone"
|
||||
155,"Brooklyn","Marine Park/Mill Basin","Boro Zone"
|
||||
156,"Staten Island","Mariners Harbor","Boro Zone"
|
||||
157,"Queens","Maspeth","Boro Zone"
|
||||
158,"Manhattan","Meatpacking/West Village West","Yellow Zone"
|
||||
159,"Bronx","Melrose South","Boro Zone"
|
||||
160,"Queens","Middle Village","Boro Zone"
|
||||
161,"Manhattan","Midtown Center","Yellow Zone"
|
||||
162,"Manhattan","Midtown East","Yellow Zone"
|
||||
163,"Manhattan","Midtown North","Yellow Zone"
|
||||
164,"Manhattan","Midtown South","Yellow Zone"
|
||||
165,"Brooklyn","Midwood","Boro Zone"
|
||||
166,"Manhattan","Morningside Heights","Boro Zone"
|
||||
167,"Bronx","Morrisania/Melrose","Boro Zone"
|
||||
168,"Bronx","Mott Haven/Port Morris","Boro Zone"
|
||||
169,"Bronx","Mount Hope","Boro Zone"
|
||||
170,"Manhattan","Murray Hill","Yellow Zone"
|
||||
171,"Queens","Murray Hill-Queens","Boro Zone"
|
||||
172,"Staten Island","New Dorp/Midland Beach","Boro Zone"
|
||||
173,"Queens","North Corona","Boro Zone"
|
||||
174,"Bronx","Norwood","Boro Zone"
|
||||
175,"Queens","Oakland Gardens","Boro Zone"
|
||||
176,"Staten Island","Oakwood","Boro Zone"
|
||||
177,"Brooklyn","Ocean Hill","Boro Zone"
|
||||
178,"Brooklyn","Ocean Parkway South","Boro Zone"
|
||||
179,"Queens","Old Astoria","Boro Zone"
|
||||
180,"Queens","Ozone Park","Boro Zone"
|
||||
181,"Brooklyn","Park Slope","Boro Zone"
|
||||
182,"Bronx","Parkchester","Boro Zone"
|
||||
183,"Bronx","Pelham Bay","Boro Zone"
|
||||
184,"Bronx","Pelham Bay Park","Boro Zone"
|
||||
185,"Bronx","Pelham Parkway","Boro Zone"
|
||||
186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone"
|
||||
187,"Staten Island","Port Richmond","Boro Zone"
|
||||
188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone"
|
||||
189,"Brooklyn","Prospect Heights","Boro Zone"
|
||||
190,"Brooklyn","Prospect Park","Boro Zone"
|
||||
191,"Queens","Queens Village","Boro Zone"
|
||||
192,"Queens","Queensboro Hill","Boro Zone"
|
||||
193,"Queens","Queensbridge/Ravenswood","Boro Zone"
|
||||
194,"Manhattan","Randalls Island","Yellow Zone"
|
||||
195,"Brooklyn","Red Hook","Boro Zone"
|
||||
196,"Queens","Rego Park","Boro Zone"
|
||||
197,"Queens","Richmond Hill","Boro Zone"
|
||||
198,"Queens","Ridgewood","Boro Zone"
|
||||
199,"Bronx","Rikers Island","Boro Zone"
|
||||
200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone"
|
||||
201,"Queens","Rockaway Park","Boro Zone"
|
||||
202,"Manhattan","Roosevelt Island","Boro Zone"
|
||||
203,"Queens","Rosedale","Boro Zone"
|
||||
204,"Staten Island","Rossville/Woodrow","Boro Zone"
|
||||
205,"Queens","Saint Albans","Boro Zone"
|
||||
206,"Staten Island","Saint George/New Brighton","Boro Zone"
|
||||
207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone"
|
||||
208,"Bronx","Schuylerville/Edgewater Park","Boro Zone"
|
||||
209,"Manhattan","Seaport","Yellow Zone"
|
||||
210,"Brooklyn","Sheepshead Bay","Boro Zone"
|
||||
211,"Manhattan","SoHo","Yellow Zone"
|
||||
212,"Bronx","Soundview/Bruckner","Boro Zone"
|
||||
213,"Bronx","Soundview/Castle Hill","Boro Zone"
|
||||
214,"Staten Island","South Beach/Dongan Hills","Boro Zone"
|
||||
215,"Queens","South Jamaica","Boro Zone"
|
||||
216,"Queens","South Ozone Park","Boro Zone"
|
||||
217,"Brooklyn","South Williamsburg","Boro Zone"
|
||||
218,"Queens","Springfield Gardens North","Boro Zone"
|
||||
219,"Queens","Springfield Gardens South","Boro Zone"
|
||||
220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone"
|
||||
221,"Staten Island","Stapleton","Boro Zone"
|
||||
222,"Brooklyn","Starrett City","Boro Zone"
|
||||
223,"Queens","Steinway","Boro Zone"
|
||||
224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone"
|
||||
225,"Brooklyn","Stuyvesant Heights","Boro Zone"
|
||||
226,"Queens","Sunnyside","Boro Zone"
|
||||
227,"Brooklyn","Sunset Park East","Boro Zone"
|
||||
228,"Brooklyn","Sunset Park West","Boro Zone"
|
||||
229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone"
|
||||
230,"Manhattan","Times Sq/Theatre District","Yellow Zone"
|
||||
231,"Manhattan","TriBeCa/Civic Center","Yellow Zone"
|
||||
232,"Manhattan","Two Bridges/Seward Park","Yellow Zone"
|
||||
233,"Manhattan","UN/Turtle Bay South","Yellow Zone"
|
||||
234,"Manhattan","Union Sq","Yellow Zone"
|
||||
235,"Bronx","University Heights/Morris Heights","Boro Zone"
|
||||
236,"Manhattan","Upper East Side North","Yellow Zone"
|
||||
237,"Manhattan","Upper East Side South","Yellow Zone"
|
||||
238,"Manhattan","Upper West Side North","Yellow Zone"
|
||||
239,"Manhattan","Upper West Side South","Yellow Zone"
|
||||
240,"Bronx","Van Cortlandt Park","Boro Zone"
|
||||
241,"Bronx","Van Cortlandt Village","Boro Zone"
|
||||
242,"Bronx","Van Nest/Morris Park","Boro Zone"
|
||||
243,"Manhattan","Washington Heights North","Boro Zone"
|
||||
244,"Manhattan","Washington Heights South","Boro Zone"
|
||||
245,"Staten Island","West Brighton","Boro Zone"
|
||||
246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone"
|
||||
247,"Bronx","West Concourse","Boro Zone"
|
||||
248,"Bronx","West Farms/Bronx River","Boro Zone"
|
||||
249,"Manhattan","West Village","Yellow Zone"
|
||||
250,"Bronx","Westchester Village/Unionport","Boro Zone"
|
||||
251,"Staten Island","Westerleigh","Boro Zone"
|
||||
252,"Queens","Whitestone","Boro Zone"
|
||||
253,"Queens","Willets Point","Boro Zone"
|
||||
254,"Bronx","Williamsbridge/Olinville","Boro Zone"
|
||||
255,"Brooklyn","Williamsburg (North Side)","Boro Zone"
|
||||
256,"Brooklyn","Williamsburg (South Side)","Boro Zone"
|
||||
257,"Brooklyn","Windsor Terrace","Boro Zone"
|
||||
258,"Queens","Woodhaven","Boro Zone"
|
||||
259,"Bronx","Woodlawn/Wakefield","Boro Zone"
|
||||
260,"Queens","Woodside","Boro Zone"
|
||||
261,"Manhattan","World Trade Center","Yellow Zone"
|
||||
262,"Manhattan","Yorkville East","Yellow Zone"
|
||||
263,"Manhattan","Yorkville West","Yellow Zone"
|
||||
264,"Unknown","NV","N/A"
|
||||
265,"Unknown","NA","N/A"
|
||||
|
05-batch/README.md (new file, 122 lines)
@ -0,0 +1,122 @@
|
||||
# Module 5: Batch Processing
|
||||
|
||||
## 5.1 Introduction
|
||||
|
||||
* :movie_camera: 5.1.1 Introduction to Batch Processing
|
||||
|
||||
[](https://youtu.be/dcHe5Fl3MF8&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=51)
|
||||
|
||||
* :movie_camera: 5.1.2 Introduction to Spark
|
||||
|
||||
[](https://youtu.be/FhaqbEOuQ8U&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=52)
|
||||
|
||||
|
||||
## 5.2 Installation
|
||||
|
||||
Follow [these instructions](setup/) to install Spark:
|
||||
|
||||
* [Windows](setup/windows.md)
|
||||
* [Linux](setup/linux.md)
|
||||
* [MacOS](setup/macos.md)
|
||||
|
||||
Then follow [this guide](setup/pyspark.md) to run PySpark in Jupyter.
|
||||
|
||||
* :movie_camera: 5.2.1 (Optional) Installing Spark (Linux)
|
||||
|
||||
[](https://youtu.be/hqUbB9c8sKg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=53)
|
||||
|
||||
Alternatively, if the setups above don't work, you can run Spark in Google Colab.
|
||||
> [!NOTE]
|
||||
> It's advisable to invest some time in setting things up locally rather than immediately jumping into this solution
|
||||
|
||||
* [Google Colab Instructions](https://medium.com/gitconnected/launch-spark-on-google-colab-and-connect-to-sparkui-342cad19b304)
|
||||
* [Google Colab Starter Notebook](https://github.com/aaalexlit/medium_articles/blob/main/Spark_in_Colab.ipynb)
|
||||
|
||||
|
||||
## 5.3 Spark SQL and DataFrames
|
||||
|
||||
* :movie_camera: 5.3.1 First Look at Spark/PySpark
|
||||
|
||||
[](https://youtu.be/r_Sf6fCB40c&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=54)
|
||||
|
||||
* :movie_camera: 5.3.2 Spark Dataframes
|
||||
|
||||
[](https://youtu.be/ti3aC1m3rE8&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=55)
|
||||
|
||||
* :movie_camera: 5.3.3 (Optional) Preparing Yellow and Green Taxi Data
|
||||
|
||||
[](https://youtu.be/CI3P4tAtru4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=56)
|
||||
|
||||
Script to prepare the dataset: [download_data.sh](code/download_data.sh)
|
||||
|
||||
> [!NOTE]
> Another way to infer the schema for the CSV files (apart from using pandas) is to set the `inferSchema` option to `true` when reading the files in Spark.
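A minimal sketch of that option (the file name below is only an example of the data downloaded by the script and may differ in your setup):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("infer-schema-test") \
    .getOrCreate()

# inferSchema makes Spark scan the file and guess column types
# instead of reading every column as a string
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("green_tripdata_2021-01.csv.gz")

df.printSchema()
```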
|
||||
* :movie_camera: 5.3.4 SQL with Spark
|
||||
|
||||
[](https://youtu.be/uAlp2VuZZPY&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=57)
|
||||
|
||||
|
||||
## 5.4 Spark Internals
|
||||
|
||||
* :movie_camera: 5.4.1 Anatomy of a Spark Cluster
|
||||
|
||||
[](https://youtu.be/68CipcZt7ZA&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=58)
|
||||
|
||||
* :movie_camera: 5.4.2 GroupBy in Spark
|
||||
|
||||
[](https://youtu.be/9qrDsY_2COo&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=59)
|
||||
|
||||
* :movie_camera: 5.4.3 Joins in Spark
|
||||
|
||||
[](https://youtu.be/lu7TrqAWuH4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=60)
|
||||
|
||||
## 5.5 (Optional) Resilient Distributed Datasets
|
||||
|
||||
* :movie_camera: 5.5.1 Operations on Spark RDDs
|
||||
|
||||
[](https://youtu.be/Bdu-xIrF3OM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=61)
|
||||
|
||||
* :movie_camera: 5.5.2 Spark RDD mapPartition
|
||||
|
||||
[](https://youtu.be/k3uB2K99roI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=62)
|
||||
|
||||
|
||||
## 5.6 Running Spark in the Cloud
|
||||
|
||||
* :movie_camera: 5.6.1 Connecting to Google Cloud Storage
|
||||
|
||||
[](https://youtu.be/Yyz293hBVcQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=63)
|
||||
|
||||
* :movie_camera: 5.6.2 Creating a Local Spark Cluster
|
||||
|
||||
[](https://youtu.be/HXBwSlXo5IA&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=64)
|
||||
|
||||
* :movie_camera: 5.6.3 Setting up a Dataproc Cluster
|
||||
|
||||
[](https://youtu.be/osAiAYahvh8&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=65)
|
||||
|
||||
* :movie_camera: 5.6.4 Connecting Spark to Big Query
|
||||
|
||||
[](https://youtu.be/HIm2BOj8C0Q&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=66)
|
||||
|
||||
|
||||
# Homework
|
||||
|
||||
* [2024 Homework](../cohorts/2024/05-batch/homework.md)
|
||||
|
||||
|
||||
# Community notes
|
||||
|
||||
Did you take notes? You can share them here.
|
||||
|
||||
* [Notes by Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/5_batch_processing.md)
|
||||
* [Sandy's DE Learning Blog](https://learningdataengineering540969211.wordpress.com/2022/02/24/week-5-de-zoomcamp-5-2-1-installing-spark-on-linux/)
|
||||
* [Notes by Alain Boisvert](https://github.com/boisalai/de-zoomcamp-2023/blob/main/week5.md)
|
||||
* [Alternative : Using docker-compose to launch spark by rafik](https://gist.github.com/rafik-rahoui/f98df941c4ccced9c46e9ccbdef63a03)
|
||||
* [Marcos Torregrosa's blog (spanish)](https://www.n4gash.com/2023/data-engineering-zoomcamp-semana-5-batch-spark)
|
||||
* [Notes by Victor Padilha](https://github.com/padilha/de-zoomcamp/tree/master/week5)
|
||||
* [Notes by Oscar Garcia](https://github.com/ozkary/Data-Engineering-Bootcamp/tree/main/Step5-Batch-Processing)
|
||||
* [Notes by HongWei](https://github.com/hwchua0209/data-engineering-zoomcamp-submission/blob/main/05-batch-processing/README.md)
|
||||
* [2024 videos transcript](https://drive.google.com/drive/folders/1XMmP4H5AMm1qCfMFxc_hqaPGw31KIVcb?usp=drive_link) by Maria Fisher
|
||||
* Add your notes here (above this line)
|
||||
@ -65,7 +65,17 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-01.csv"
|
||||
"!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-01.csv.gz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "201a5957",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gzip -dc fhvhv_tripdata_2021-01.csv.gz"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -501,25 +511,25 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag\r",
|
||||
"hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02682,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,\r",
|
||||
"HV0003,B02682,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02682,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,\r",
|
||||
"HV0003,B02682,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02764,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,\r",
|
||||
"HV0003,B02764,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02764,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,\r",
|
||||
"HV0003,B02764,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02764,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,\r",
|
||||
"HV0003,B02764,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,\r\n",
|
||||
"\r\n",
|
||||
"HV0005,B02510,2021-01-01 00:06:59,2021-01-01 00:43:01,88,42,\r",
|
||||
"HV0005,B02510,2021-01-01 00:06:59,2021-01-01 00:43:01,88,42,\r\n",
|
||||
"\r\n",
|
||||
"HV0005,B02510,2021-01-01 00:50:00,2021-01-01 01:04:57,42,151,\r",
|
||||
"HV0005,B02510,2021-01-01 00:50:00,2021-01-01 01:04:57,42,151,\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02764,2021-01-01 00:14:30,2021-01-01 00:50:27,71,226,\r",
|
||||
"HV0003,B02764,2021-01-01 00:14:30,2021-01-01 00:50:27,71,226,\r\n",
|
||||
"\r\n",
|
||||
"HV0003,B02875,2021-01-01 00:22:54,2021-01-01 00:30:20,112,255,\r",
|
||||
"HV0003,B02875,2021-01-01 00:22:54,2021-01-01 00:30:20,112,255,\r\n",
|
||||
"\r\n"
|
||||
]
|
||||
}
|
||||
@ -57,8 +57,7 @@ rm openjdk-11.0.2_linux-x64_bin.tar.gz
|
||||
Download Spark. Use 3.3.2 version:
|
||||
|
||||
```bash
|
||||
wget https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
|
||||
|
||||
wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
|
||||
```
|
||||
|
||||
Unpack:
|
||||
@ -10,7 +10,7 @@ for other MacOS versions as well
|
||||
Ensure Brew and Java are installed on your system:
|
||||
|
||||
```bash
|
||||
xcode-select –install
|
||||
xcode-select --install
|
||||
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)"
|
||||
brew install java
|
||||
```
|
||||
@ -24,12 +24,37 @@ export PATH="$JAVA_HOME/bin/:$PATH"
|
||||
|
||||
Make sure Java was installed to `/usr/local/Cellar/openjdk@11/11.0.12`: Open Finder > Press Cmd+Shift+G > paste "/usr/local/Cellar/openjdk@11/11.0.12". If you can't find it, then change the path location to appropriate path on your machine. You can also run `brew info java` to check where java was installed on your machine.
|
||||
|
||||
### Anaconda-based Spark setup

If you have an Anaconda setup, you can skip the manual Spark installation and instead use the PySpark package to run Spark.
With Anaconda on Mac, set up Spark by first installing `pyspark`, and then `findspark` to handle the environment variables.

Open Anaconda and activate the environment where you want to apply these changes.

Install `pyspark` as a package in this environment.
Install `findspark` as a package in this environment.

Ensure that OpenJDK is already set up. With this approach we don't have to install Spark separately or set up the environment manually. You may also have to use Jupyter Lab (instead of Jupyter Notebook) to open a notebook for running the programs.
Once Spark is set up, start the conda environment and open Jupyter Lab.
Run the program below in a notebook to check that everything is running fine.
|
||||
```
|
||||
import pyspark
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
!spark-shell --version
|
||||
|
||||
# Create SparkSession
|
||||
spark = SparkSession.builder.master("local[1]") \
|
||||
.appName('test-spark') \
|
||||
.getOrCreate()
|
||||
|
||||
print(f'The PySpark {spark.version} version is running...')
|
||||
```
|
||||
### Installing Spark
|
||||
|
||||
1. Install Scala
|
||||
|
||||
```bash
|
||||
brew install scala@2.11
|
||||
brew install scala@2.13
|
||||
```
|
||||
|
||||
2. Install Apache Spark
|
||||
@ -64,3 +89,4 @@ distData.filter(_ < 10).collect()
|
||||
It's the same for all platforms. Go to [pyspark.md](pyspark.md).
|
||||
|
||||
|
||||
|
||||
@ -20,13 +20,21 @@ For example, if the file under `${SPARK_HOME}/python/lib/` is `py4j-0.10.9.3-src
|
||||
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"
|
||||
```
|
||||
|
||||
On Windows, you may have to convert the path from Unix-style to Windows-style:
|
||||
|
||||
```bash
|
||||
SPARK_WIN=`cygpath -w ${SPARK_HOME}`
|
||||
|
||||
export PYTHONPATH="${SPARK_WIN}\\python\\"
|
||||
export PYTHONPATH="${SPARK_WIN}\\python\\lib\\py4j-0.10.9-src.zip;$PYTHONPATH"
|
||||
```
|
||||
|
||||
Now you can run Jupyter or IPython to test if things work. Go to some other directory, e.g. `~/tmp`.
|
||||
|
||||
Download a CSV file that we'll use for testing:
|
||||
|
||||
```bash
|
||||
wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
|
||||
wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
|
||||
```
|
||||
|
||||
Now let's run `ipython` (or `jupyter notebook`) and execute:
|
||||
@ -42,7 +50,7 @@ spark = SparkSession.builder \
|
||||
|
||||
df = spark.read \
|
||||
.option("header", "true") \
|
||||
.csv('taxi+_zone_lookup.csv')
|
||||
.csv('taxi_zone_lookup.csv')
|
||||
|
||||
df.show()
|
||||
```
|
||||
@ -56,6 +56,19 @@ for FILE in ${FILES}; do
|
||||
done
|
||||
```
|
||||
|
||||
If you don't have wget, you can use curl:
|
||||
|
||||
```bash
|
||||
HADOOP_VERSION="3.2.0"
|
||||
PREFIX="https://raw.githubusercontent.com/cdarlint/winutils/master/hadoop-${HADOOP_VERSION}/bin/"
|
||||
|
||||
FILES="hadoop.dll hadoop.exp hadoop.lib hadoop.pdb libwinutils.lib winutils.exe winutils.pdb"
|
||||
|
||||
for FILE in ${FILES}; do
|
||||
curl -o "${FILE}" "${PREFIX}/${FILE}";
|
||||
done
|
||||
```
|
||||
|
||||
Add it to `PATH`:
|
||||
|
||||
```bash
|
||||
@ -68,7 +81,7 @@ export PATH="${HADOOP_HOME}/bin:${PATH}"
|
||||
Now download Spark. Select version 3.3.2
|
||||
|
||||
```bash
|
||||
wget https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
|
||||
wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
|
||||
```
|
||||
|
||||
|
||||
06-streaming/README.md (new file, 131 lines)
@ -0,0 +1,131 @@
|
||||
# Module 6: Stream Processing
|
||||
|
||||
# Code structure
|
||||
* [Java examples](java)
|
||||
* [Python examples](python)
|
||||
* [KSQLD examples](ksqldb)
|
||||
|
||||
## Confluent cloud setup
|
||||
Confluent Cloud provides a free 30-day trial; you can sign up [here](https://www.confluent.io/confluent-cloud/tryfree/)
|
||||
|
||||
## Introduction to Stream Processing
|
||||
|
||||
- [Slides](https://docs.google.com/presentation/d/1bCtdCba8v1HxJ_uMm9pwjRUC-NAMeB-6nOG2ng3KujA/edit?usp=sharing)
|
||||
|
||||
- :movie_camera: 6.0.1 Introduction
|
||||
|
||||
[](https://youtu.be/hfvju3iOIP0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=67)
|
||||
|
||||
- :movie_camera: 6.0.2 What is stream processing
|
||||
|
||||
[](https://youtu.be/WxTxKGcfA-k&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=68)
|
||||
|
||||
## Introduction to Kafka
|
||||
|
||||
- :movie_camera: 6.3 What is kafka?
|
||||
|
||||
[](https://youtu.be/zPLZUDPi4AY&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=69)
|
||||
|
||||
- :movie_camera: 6.4 Confluent cloud
|
||||
|
||||
[](https://youtu.be/ZnEZFEYKppw&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=70)
|
||||
|
||||
- :movie_camera: 6.5 Kafka producer consumer
|
||||
|
||||
[](https://youtu.be/aegTuyxX7Yg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=71)
|
||||
|
||||
## Kafka Configuration
|
||||
|
||||
- :movie_camera: 6.6 Kafka configuration
|
||||
|
||||
[](https://youtu.be/SXQtWyRpMKs&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=72)
|
||||
|
||||
- [Kafka Configuration Reference](https://docs.confluent.io/platform/current/installation/configuration/)
|
||||
|
||||
## Kafka Streams
|
||||
|
||||
- [Slides](https://docs.google.com/presentation/d/1fVi9sFa7fL2ZW3ynS5MAZm0bRSZ4jO10fymPmrfTUjE/edit?usp=sharing)
|
||||
|
||||
- [Streams Concepts](https://docs.confluent.io/platform/current/streams/concepts.html)
|
||||
|
||||
- :movie_camera: 6.7 Kafka streams basics
|
||||
|
||||
[](https://youtu.be/dUyA_63eRb0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=73)
|
||||
|
||||
- :movie_camera: 6.8 Kafka stream join
|
||||
|
||||
[](https://youtu.be/NcpKlujh34Y&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=74)
|
||||
|
||||
- :movie_camera: 6.9 Kafka stream testing
|
||||
|
||||
[](https://youtu.be/TNx5rmLY8Pk&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=75)
|
||||
|
||||
- :movie_camera: 6.10 Kafka stream windowing
|
||||
|
||||
[](https://youtu.be/r1OuLdwxbRc&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=76)
|
||||
|
||||
- :movie_camera: 6.11 Kafka ksqldb & Connect
|
||||
|
||||
[](https://youtu.be/DziQ4a4tn9Y&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=77)
|
||||
|
||||
- :movie_camera: 6.12 Kafka Schema registry
|
||||
|
||||
[](https://youtu.be/tBY_hBuyzwI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=78)
|
||||
|
||||
## Faust - Python Stream Processing
|
||||
|
||||
- [Faust Documentation](https://faust.readthedocs.io/en/latest/index.html)
|
||||
- [Faust vs Kafka Streams](https://faust.readthedocs.io/en/latest/playbooks/vskafka.html)
|
||||
|
||||
## Pyspark - Structured Streaming
|
||||
Please follow the steps described under [pyspark-streaming](python/streams-example/pyspark/README.md)
|
||||
|
||||
- :movie_camera: 6.13 Kafka Streaming with Python
|
||||
|
||||
[](https://youtu.be/BgAlVknDFlQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=79)
|
||||
|
||||
- :movie_camera: 6.14 Pyspark Structured Streaming
|
||||
|
||||
[](https://youtu.be/VIVr7KwRQmE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=80)
|
||||
|
||||
## Kafka Streams with JVM library
|
||||
|
||||
- [Confluent Kafka Streams](https://kafka.apache.org/documentation/streams/)
|
||||
- [Scala Example](https://github.com/AnkushKhanna/kafka-helper/tree/master/src/main/scala/kafka/schematest)
|
||||
|
||||
## KSQL and ksqlDB
|
||||
|
||||
- [Introducing KSQL: Streaming SQL for Apache Kafka](https://www.confluent.io/blog/ksql-streaming-sql-for-apache-kafka/)
|
||||
- [ksqlDB](https://ksqldb.io/)
|
||||
|
||||
## Kafka Connect
|
||||
|
||||
- [Making Sense of Stream Data](https://medium.com/analytics-vidhya/making-sense-of-stream-data-b74c1252a8f5)
|
||||
|
||||
## Docker
|
||||
|
||||
### Starting cluster
|
||||
|
||||
## Command line for Kafka
|
||||
|
||||
### Create topic
|
||||
|
||||
```bash
|
||||
./bin/kafka-topics.sh --create --topic demo_1 --bootstrap-server localhost:9092 --partitions 2
|
||||
```
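To sanity-check the new topic, you can do a quick produce/consume round trip with the console tools (a sketch; only the `demo_1` topic and the local broker address come from the command above):

```bash
# write a test message to the topic
echo "hello" | ./bin/kafka-console-producer.sh --topic demo_1 --bootstrap-server localhost:9092

# read it back from the beginning of the topic
./bin/kafka-console-consumer.sh --topic demo_1 --bootstrap-server localhost:9092 --from-beginning
```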
|
||||
|
||||
## Homework
|
||||
|
||||
* [2024 Homework](../cohorts/2024/06-streaming/homework.md)
|
||||
|
||||
## Community notes
|
||||
|
||||
Did you take notes? You can share them here.
|
||||
|
||||
* [Notes by Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/6_streaming.md )
|
||||
* [Marcos Torregrosa's blog (spanish)](https://www.n4gash.com/2023/data-engineering-zoomcamp-semana-6-stream-processing/)
|
||||
* [Notes by Oscar Garcia](https://github.com/ozkary/Data-Engineering-Bootcamp/tree/main/Step6-Streaming)
|
||||
* [2024 videos transcript](https://drive.google.com/drive/folders/1UngeL5FM-GcDLM7QYaDTKb3jIS6CQC14?usp=drive_link) by Maria Fisher
|
||||
* [Notes by Shayan Shafiee Moghadam](https://github.com/shayansm2/eng-notebook/blob/main/kafka/readme.md)
|
||||
* Add your notes here (above this line)
|
||||
|
||||