Compare commits: `stream-hom` ... `main` (60 commits)

Commits: a6045b3ddb, e6fd4c16d2, ded05d789c, c0b7d74647, a65e6de49c, 11d5096f6e, 4fbe5ebe43, 020af9c5fa, 196d307de7, 8a3cc88f5e, f66f0ff62b, 092e24eb6c, 7e13f11a2d, 1e7c141cb7, fe4500b0ca, e830b57a4c, 0281175f7c, f24e9dc1d8, 8daeb91a92, 1fb6720a93, 3d41139216, d5a4567208, 1739740eba, cee8c30227, 8a09faf5a5, 59254b4f66, 93a5f99aea, beb77c92b9, 5a6cedd21b, cd322fb154, 380eafa8d1, 6aa4a58420, f64aa8339b, 568f47eccd, 57edfa075f, 6f35486ac6, 7fb9aa7c5b, f0ad0f2c75, 3f062330c7, 37e37b3c9c, be1fe3f071, 8f67b3358d, 20f14e5a5c, 191d9fe23d, eea47ecfe5, 4345a33d7f, 5f27b7ceb5, b8da62cb88, c1d6fde336, 3ad1730500, 6bdf7883a5, 6b2e40d70a, ed96de4b49, 4746212309, 8c0bb0b43e, 1ad93cd381, 2096b9e2a1, beeb9e6454, af646fa483, 71d5e47ea0
@@ -1,93 +0,0 @@
|
||||
# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go/.devcontainer/base.Dockerfile
|
||||
|
||||
# [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1-buster, 1.16-buster, 1.17-buster
|
||||
ARG VARIANT=1-bullseye
|
||||
FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT}
|
||||
|
||||
# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
|
||||
ARG NODE_VERSION="none"
|
||||
RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi
|
||||
|
||||
# Install powershell
|
||||
ARG PS_VERSION="7.2.1"
|
||||
# powershell-7.3.0-linux-x64.tar.gz
|
||||
# powershell-7.3.0-linux-arm64.tar.gz
|
||||
RUN ARCH="$(dpkg --print-architecture)"; \
|
||||
if [ "${ARCH}" = "amd64" ]; then \
|
||||
PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-x64.tar.gz"; \
|
||||
elif [ "${ARCH}" = "arm64" ]; then \
|
||||
PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm64.tar.gz"; \
|
||||
elif [ "${ARCH}" = "armhf" ]; then \
|
||||
PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm32.tar.gz"; \
|
||||
fi; \
|
||||
wget https://github.com/PowerShell/PowerShell/releases/download/$PS_BIN -O pwsh.tar.gz; \
|
||||
mkdir /usr/local/pwsh && \
|
||||
tar Cxvfz /usr/local/pwsh pwsh.tar.gz && \
|
||||
rm pwsh.tar.gz
|
||||
|
||||
ENV PATH=$PATH:/usr/local/pwsh
|
||||
|
||||
RUN echo 'deb http://download.opensuse.org/repositories/shells:/fish:/release:/3/Debian_11/ /' | tee /etc/apt/sources.list.d/shells:fish:release:3.list; \
|
||||
curl -fsSL https://download.opensuse.org/repositories/shells:fish:release:3/Debian_11/Release.key | gpg --dearmor | tee /etc/apt/trusted.gpg.d/shells_fish_release_3.gpg > /dev/null; \
|
||||
apt-get update && export DEBIAN_FRONTEND=noninteractive \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
fish \
|
||||
tmux \
|
||||
fzf \
|
||||
&& apt-get clean
|
||||
|
||||
ARG USERNAME=vscode
|
||||
|
||||
# Download the oh-my-posh binary
|
||||
RUN mkdir /home/${USERNAME}/bin; \
|
||||
wget https://github.com/JanDeDobbeleer/oh-my-posh/releases/latest/download/posh-linux-$(dpkg --print-architecture) -O /home/${USERNAME}/bin/oh-my-posh; \
|
||||
chmod +x /home/${USERNAME}/bin/oh-my-posh; \
|
||||
chown ${USERNAME}: /home/${USERNAME}/bin;
|
||||
|
||||
# NOTE: devcontainers are Linux-only at this time but when
|
||||
# Windows or Darwin is supported someone will need to improve
|
||||
# the code logic above.
|
||||
|
||||
# Setup a neat little PowerShell experience
|
||||
RUN pwsh -Command Install-Module posh-git -Scope AllUsers -Force; \
|
||||
pwsh -Command Install-Module z -Scope AllUsers -Force; \
|
||||
pwsh -Command Install-Module PSFzf -Scope AllUsers -Force; \
|
||||
pwsh -Command Install-Module Terminal-Icons -Scope AllUsers -Force;
|
||||
|
||||
# add the oh-my-posh path to the PATH variable
|
||||
ENV PATH "$PATH:/home/${USERNAME}/bin"
|
||||
|
||||
# Can be used to override the devcontainer prompt default theme:
|
||||
ENV POSH_THEME="https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json"
|
||||
|
||||
# Deploy oh-my-posh prompt to Powershell:
|
||||
COPY Microsoft.PowerShell_profile.ps1 /home/${USERNAME}/.config/powershell/Microsoft.PowerShell_profile.ps1
|
||||
|
||||
# Deploy oh-my-posh prompt to Fish:
|
||||
COPY config.fish /home/${USERNAME}/.config/fish/config.fish
|
||||
|
||||
# Everything runs as root during build time, so we want
|
||||
# to make sure the vscode user can edit these paths too:
|
||||
RUN chmod 777 -R /home/${USERNAME}/.config
|
||||
|
||||
# Override vscode's own Bash prompt with oh-my-posh:
|
||||
RUN sed -i 's/^__bash_prompt$/#&/' /home/${USERNAME}/.bashrc && \
|
||||
echo "eval \"\$(oh-my-posh init bash --config $POSH_THEME)\"" >> /home/${USERNAME}/.bashrc
|
||||
|
||||
# Override vscode's own ZSH prompt with oh-my-posh:
|
||||
RUN echo "eval \"\$(oh-my-posh init zsh --config $POSH_THEME)\"" >> /home/${USERNAME}/.zshrc
|
||||
|
||||
# Set container timezone:
|
||||
ARG TZ="UTC"
|
||||
RUN ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime
|
||||
|
||||
# Required for Python - Confluent Kafka on M1 Silicon
|
||||
RUN apt update && apt -y install software-properties-common gcc
|
||||
RUN git clone https://github.com/edenhill/librdkafka
|
||||
RUN cd librdkafka && ./configure && make && make install && ldconfig
|
||||
|
||||
# [Optional] Uncomment the next line to use go get to install anything else you need
|
||||
# RUN go get -x github.com/JanDeDobbeleer/battery
|
||||
|
||||
# [Optional] Uncomment this line to install global node packages.
|
||||
# RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g <your-package-here>" 2>&1
|
||||
@@ -1,14 +0,0 @@
Import-Module posh-git
Import-Module PSFzf -ArgumentList 'Ctrl+t', 'Ctrl+r'
Import-Module z
Import-Module Terminal-Icons

Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete

$env:POSH_GIT_ENABLED=$true
oh-my-posh init pwsh --config $env:POSH_THEME | Invoke-Expression

# NOTE: You can override the above env var from the devcontainer.json "args" under the "build" key.

# Aliases
Set-Alias -Name ac -Value Add-Content
@@ -1,58 +0,0 @@
|
||||
# Devcontainer for DataTalksClub Data Engineering Zoomcamp
|
||||
This devcontainer sets up a development environment for this class. This can be used with both VS Code and GitHub Codespaces.
|
||||
|
||||
## Getting Started
|
||||
To continue, make sure you have [Visual Studio Code](https://code.visualstudio.com/) and [Docker Desktop](https://www.docker.com/products/docker-desktop/) installed OR use [GitHub Codespaces](https://github.com/features/codespaces).
|
||||
|
||||
**Option 1: Local VS Code**
|
||||
|
||||
1. Clone the repo and connect to it in VS Code:
|
||||
|
||||
```bash
|
||||
$ cd your/desired/repo/location
|
||||
$ git clone https://github.com/DataTalksClub/data-engineering-zoomcamp.git
|
||||
```
|
||||
|
||||
2. Download the [`Dev Containers`](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension from the VS Code Marketplace.

3. Press Cmd + Shift + P (Mac) or Ctrl + Shift + P (Windows) to open the Command Palette. Type `Dev Containers: Open Folder in Container` and select the repo directory.

4. Wait for the container to build and the dependencies to install.
|
||||
|
||||
**Option 2: GitHub Codespaces**
|
||||
|
||||
1. Fork this repo
|
||||
|
||||
2. From the repo page in GitHub, select the green `<> Code` button and choose Codespaces
|
||||
|
||||
3. Click `Create Codespace on Main`, or checkout a branch if you prefer
|
||||
|
||||
4. Wait for the container to build and the dependencies to install
|
||||
|
||||
5. Start developing!
|
||||
|
||||
|
||||
## Included Tools and Languages:
|
||||
|
||||
* `Python 3.9`
|
||||
- `Pandas`
|
||||
- `SQLAlchemy`
|
||||
- `PySpark`
|
||||
- `PyArrow`
|
||||
- `Polars`
|
||||
- `Prefect 2.7.7` and all required Python dependencies
|
||||
- `confluent-kafka`
|
||||
* `Google Cloud SDK`
|
||||
* `dbt-core`
|
||||
- `dbt-postgres`
|
||||
- `dbt-bigquery`
|
||||
* `Terraform`
|
||||
* `Jupyter Notebooks for VS Code`
|
||||
* `Docker`
|
||||
* `Spark`
|
||||
* `JDK` version 11
|
||||
* [`Oh-My-Posh Powershell themes`](https://github.com/JanDeDobbeleer/oh-my-posh)
|
||||
* Popular VS Code themes (GitHub, Atom One, Material Icons etc.)
|
||||
|
||||
## Customization
|
||||
Feel free to modify the `Dockerfile`, `devcontainer.json`, or `requirements.txt` to include any other tools or packages you need for your development environment. In the Dockerfile, you can also customize the `POSH_THEME` environment variable with a theme of your choosing from [here](https://ohmyposh.dev/docs/themes).
|
||||
@@ -1,4 +0,0 @@
# Activate oh-my-posh prompt:
oh-my-posh init fish --config $POSH_THEME | source

# NOTE: You can override the above env vars from the devcontainer.json "args" under the "build" key.
@@ -1,117 +0,0 @@
|
||||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
|
||||
// https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go
|
||||
{
|
||||
"name": "oh-my-posh",
|
||||
"build": {
|
||||
"dockerfile": "Dockerfile",
|
||||
"args": {
|
||||
// Update the VARIANT arg to pick a version of Go: 1, 1.16, 1.17
|
||||
// Append -bullseye or -buster to pin to an OS version.
|
||||
// Use -bullseye variants on local arm64/Apple Silicon.
|
||||
"VARIANT": "1.19-bullseye",
|
||||
// Options:
|
||||
|
||||
"POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json",
|
||||
|
||||
// Override me with your own timezone:
|
||||
"TZ": "America/Moncton",
|
||||
// Use one of the "TZ database name" entries from:
|
||||
// https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
||||
|
||||
"NODE_VERSION": "lts/*",
|
||||
//Powershell version
|
||||
"PS_VERSION": "7.2.7"
|
||||
}
|
||||
},
|
||||
"runArgs": ["--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined"],
|
||||
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/azure-cli:1": {
|
||||
"version": "latest"
|
||||
},
|
||||
"ghcr.io/devcontainers/features/python:1": {
|
||||
"version": "3.9"
|
||||
},
|
||||
"ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {},
|
||||
"ghcr.io/devcontainers-contrib/features/terraform-asdf:2": {},
|
||||
"ghcr.io/devcontainers-contrib/features/yamllint:2": {},
|
||||
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
|
||||
"ghcr.io/devcontainers/features/docker-outside-of-docker:1": {},
|
||||
"ghcr.io/devcontainers/features/github-cli:1": {},
|
||||
"ghcr.io/devcontainers-contrib/features/spark-sdkman:2": {
|
||||
"jdkVersion": "11"
|
||||
},
|
||||
"ghcr.io/dhoeric/features/google-cloud-cli:1": {
|
||||
"version": "latest"
|
||||
}
|
||||
},
|
||||
|
||||
// Set *default* container specific settings.json values on container create.
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"settings": {
|
||||
"go.toolsManagement.checkForUpdates": "local",
|
||||
"go.useLanguageServer": true,
|
||||
"go.gopath": "/go",
|
||||
"go.goroot": "/usr/local/go",
|
||||
"terminal.integrated.profiles.linux": {
|
||||
"bash": {
|
||||
"path": "bash"
|
||||
},
|
||||
"zsh": {
|
||||
"path": "zsh"
|
||||
},
|
||||
"fish": {
|
||||
"path": "fish"
|
||||
},
|
||||
"tmux": {
|
||||
"path": "tmux",
|
||||
"icon": "terminal-tmux"
|
||||
},
|
||||
"pwsh": {
|
||||
"path": "pwsh",
|
||||
"icon": "terminal-powershell"
|
||||
}
|
||||
},
|
||||
"terminal.integrated.defaultProfile.linux": "pwsh",
|
||||
"terminal.integrated.defaultProfile.windows": "pwsh",
|
||||
"terminal.integrated.defaultProfile.osx": "pwsh",
|
||||
"tasks.statusbar.default.hide": true,
|
||||
"terminal.integrated.tabs.defaultIcon": "terminal-powershell",
|
||||
"terminal.integrated.tabs.defaultColor": "terminal.ansiBlue",
|
||||
"workbench.colorTheme": "GitHub Dark Dimmed",
|
||||
"workbench.iconTheme": "material-icon-theme"
|
||||
},
|
||||
|
||||
// Add the IDs of extensions you want installed when the container is created.
|
||||
"extensions": [
|
||||
"actboy168.tasks",
|
||||
"eamodio.gitlens",
|
||||
"davidanson.vscode-markdownlint",
|
||||
"editorconfig.editorconfig",
|
||||
"esbenp.prettier-vscode",
|
||||
"github.vscode-pull-request-github",
|
||||
"golang.go",
|
||||
"ms-vscode.powershell",
|
||||
"redhat.vscode-yaml",
|
||||
"yzhang.markdown-all-in-one",
|
||||
"ms-python.python",
|
||||
"ms-python.vscode-pylance",
|
||||
"ms-toolsai.jupyter",
|
||||
"akamud.vscode-theme-onedark",
|
||||
"ms-vscode-remote.remote-containers",
|
||||
"PKief.material-icon-theme",
|
||||
"GitHub.github-vscode-theme"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
// "forwardPorts": [3000],
|
||||
|
||||
// Use 'postCreateCommand' to run commands after the container is created.
|
||||
"postCreateCommand": "pip3 install --user -r .devcontainer/requirements.txt --use-pep517",
|
||||
|
||||
// Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
|
||||
"remoteUser": "vscode"
|
||||
}
|
||||
@@ -1,16 +0,0 @@
pandas==1.5.2
prefect==2.7.7
prefect-sqlalchemy==0.2.2
prefect-gcp[cloud_storage]==0.2.4
protobuf
pyarrow==10.0.1
pandas-gbq==0.18.1
psycopg2-binary==2.9.5
sqlalchemy==1.4.46
ipykernel
polars
dbt-core
dbt-bigquery
dbt-postgres
pyspark
confluent-kafka==1.9.2
@@ -208,4 +208,8 @@ Did you take notes? You can share them here
* [2024 Module-01 Environment setup video by ellacharmed on youtube](https://youtu.be/Zce_Hd37NGs)
* [Docker Notes by Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/1a-docker_sql/readme.md) • [Terraform Notes by Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/1b-terraform_gcp/readme.md)
* [Notes from Hammad Tariq](https://github.com/hamad-tariq/HammadTariq-ZoomCamp2024/blob/9c8b4908416eb8cade3d7ec220e7664c003e9b11/week_1_basics_n_setup/README.md)
* [Hung's Notes](https://hung.bearblog.dev/docker/) & [Docker Cheatsheet](https://github.com/HangenYuu/docker-cheatsheet)
* [Kemal's Notes](https://github.com/kemaldahha/data-engineering-course/blob/main/week_1_notes.md)
* [Notes from Manuel Guerra (Windows+WSL2 Environment)](https://github.com/ManuelGuerra1987/data-engineering-zoomcamp-notes/blob/main/1_Containerization-and-Infrastructure-as-Code/README.md)
* [Notes from Horeb SEIDOU](https://www.notion.so/Week-1-Containerization-and-Infrastructure-as-Code-15729780dc4a80a08288e497ba937a37?pvs=4)
* Add your notes above this line
@@ -1,191 +1,306 @@
|
||||
> [!NOTE]
|
||||
>If you're looking for Airflow videos from the 2022 edition, check the [2022 cohort folder](../cohorts/2022/week_2_data_ingestion/).
|
||||
>
|
||||
>If you're looking for Prefect videos from the 2023 edition, check the [2023 cohort folder](../cohorts/2023/week_2_data_ingestion/).
|
||||
|
||||
# Week 2: Workflow Orchestration
|
||||
|
||||
Welcome to Week 2 of the Data Engineering Zoomcamp! 🚀😤 This week, we'll be covering workflow orchestration with Mage.
|
||||
Welcome to Week 2 of the Data Engineering Zoomcamp! This week, we’ll dive into workflow orchestration using [Kestra](https://go.kestra.io/de-zoomcamp/github).
|
||||
|
||||
Mage is an open-source, hybrid framework for transforming and integrating data. ✨
|
||||
Kestra is an open-source, event-driven orchestration platform that simplifies building both scheduled and event-driven workflows. By adopting Infrastructure as Code practices for data and process orchestration, Kestra enables you to build reliable workflows with just a few lines of YAML.
|
||||
|
||||
This week, you'll learn how to use the Mage platform to author and share _magical_ data pipelines. This will all be covered in the course, but if you'd like to learn a bit more about Mage, check out our docs [here](https://docs.mage.ai/introduction/overview).
|
||||
> [!NOTE]
|
||||
>You can find all videos for this week in this [YouTube Playlist](https://go.kestra.io/de-zoomcamp/yt-playlist).
|
||||
|
||||
* [2.2.1 - 📯 Intro to Orchestration](#221----intro-to-orchestration)
|
||||
* [2.2.2 - 🧙♂️ Intro to Mage](#222---%EF%B8%8F-intro-to-mage)
|
||||
* [2.2.3 - 🐘 ETL: API to Postgres](#223----etl-api-to-postgres)
|
||||
* [2.2.4 - 🤓 ETL: API to GCS](#224----etl-api-to-gcs)
|
||||
* [2.2.5 - 🔍 ETL: GCS to BigQuery](#225----etl-gcs-to-bigquery)
|
||||
* [2.2.6 - 👨💻 Parameterized Execution](#226----parameterized-execution)
|
||||
* [2.2.7 - 🤖 Deployment (Optional)](#227----deployment-optional)
|
||||
* [2.2.8 - 🗒️ Homework](#228---️-homework)
|
||||
* [2.2.9 - 👣 Next Steps](#229----next-steps)
|
||||
---
|
||||
|
||||
## 📕 Course Resources
|
||||
# Course Structure
|
||||
|
||||
### 2.2.1 - 📯 Intro to Orchestration
|
||||
## 1. Conceptual Material: Introduction to Orchestration and Kestra
|
||||
|
||||
In this section, we'll cover the basics of workflow orchestration. We'll discuss what it is, why it's important, and how it can be used to build data pipelines.
|
||||
In this section, you’ll learn the foundations of workflow orchestration, its importance, and how Kestra fits into the orchestration landscape.
|
||||
|
||||
Videos
|
||||
- 2.2.1a - What is Orchestration?
|
||||
### Videos
|
||||
- **2.2.1 - Introduction to Workflow Orchestration**
|
||||
[](https://youtu.be/Np6QmmcgLCs)
|
||||
|
||||
[](https://youtu.be/Li8-MWHhTbo&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=17)
|
||||
- **2.2.2 - Learn the Concepts of Kestra**
|
||||
[](https://youtu.be/o79n-EVpics)
|
||||
|
||||
### Resources
|
||||
- [Quickstart Guide](https://go.kestra.io/de-zoomcamp/quickstart)
|
||||
- [Install Kestra with Docker Compose](https://go.kestra.io/de-zoomcamp/docker-compose)
|
||||
- [Tutorial](https://go.kestra.io/de-zoomcamp/tutorial)
|
||||
- [What is an Orchestrator?](https://go.kestra.io/de-zoomcamp/what-is-an-orchestrator)
|
||||
|
||||
---
|
||||
|
||||
## 2. Hands-On Coding Project: Build Data Pipelines with Kestra
|
||||
|
||||
This week, we're gonna build ETL pipelines for Yellow and Green Taxi data from NYC’s Taxi and Limousine Commission (TLC). You will:
|
||||
1. Extract data from [CSV files](https://github.com/DataTalksClub/nyc-tlc-data/releases).
|
||||
2. Load it into Postgres or Google Cloud (GCS + BigQuery).
|
||||
3. Explore scheduling and backfilling workflows.
|
||||
|
||||
### File Structure
|
||||
|
||||
The project is organized as follows:
|
||||
```
|
||||
.
|
||||
├── flows/
|
||||
│ ├── 01_getting_started_data_pipeline.yaml
|
||||
│ ├── 02_postgres_taxi.yaml
|
||||
│ ├── 02_postgres_taxi_scheduled.yaml
|
||||
│ ├── 03_postgres_dbt.yaml
|
||||
│ ├── 04_gcp_kv.yaml
|
||||
│ ├── 05_gcp_setup.yaml
|
||||
│ ├── 06_gcp_taxi.yaml
|
||||
│ ├── 06_gcp_taxi_scheduled.yaml
|
||||
│ └── 07_gcp_dbt.yaml
|
||||
```
|
||||
|
||||
### Setup Kestra
|
||||
|
||||
We'll set up Kestra using Docker Compose, with one container for the Kestra server and another for its Postgres backend:
|
||||
|
||||
```bash
|
||||
cd 02-workflow-orchestration/
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
Once the containers start, you can access the Kestra UI at [http://localhost:8080](http://localhost:8080).
|
||||
|
||||
If you prefer to add flows programmatically using Kestra's API, run the following commands:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/01_getting_started_data_pipeline.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/02_postgres_taxi.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/02_postgres_taxi_scheduled.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/03_postgres_dbt.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/04_gcp_kv.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/05_gcp_setup.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/06_gcp_taxi.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/06_gcp_taxi_scheduled.yaml
|
||||
curl -X POST http://localhost:8080/api/v1/flows/import -F fileUpload=@flows/07_gcp_dbt.yaml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. ETL Pipelines in Kestra: Detailed Walkthrough
|
||||
|
||||
### Getting Started Pipeline
|
||||
|
||||
This introductory flow is added just to demonstrate a simple data pipeline which extracts data via HTTP REST API, transforms that data in Python and then queries it using DuckDB.
|
||||
|
||||
### Videos
|
||||
|
||||
- **2.2.3 - Create an ETL Pipeline with Postgres in Kestra**
|
||||
[](https://youtu.be/OkfLX28Ecjg?si=vKbIyWo1TtjpNnvt)
|
||||
- **2.2.4 - Manage Scheduling and Backfills using Postgres in Kestra**
|
||||
[](https://youtu.be/_-li_z97zog?si=G6jZbkfJb3GAyqrd)
|
||||
- **2.2.5 - Transform Data with dbt and Postgres in Kestra**
|
||||
[](https://youtu.be/ZLp2N6p2JjE?si=tWhcvq5w4lO8v1_p)
|
||||
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Extract[Extract Data via HTTP REST API] --> Transform[Transform Data in Python]
|
||||
Transform --> Query[Query Data with DuckDB]
|
||||
```
|
||||
|
||||
Add the flow [`01_getting_started_data_pipeline.yaml`](flows/01_getting_started_data_pipeline.yaml) from the UI if you haven't already and execute it to see the results. Inspect the Gantt and Logs tabs to understand the flow execution.
|
||||
|
||||
### Local DB: Load Taxi Data to Postgres
|
||||
|
||||
Before we start loading data to GCP, we'll first play with the Yellow and Green Taxi data using a local Postgres database running in a Docker container. We'll create a new Postgres database for these examples using this [Docker Compose file](postgres/docker-compose.yml). Download it into a new directory, navigate to it and run the following command to start it:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
The flow will extract CSV data partitioned by year and month, create tables, load data to the monthly table, and finally merge the data to the final destination table.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Start[Select Year & Month] --> SetLabel[Set Labels]
|
||||
SetLabel --> Extract[Extract CSV Data]
|
||||
Extract -->|Taxi=Yellow| YellowFinalTable[Create Yellow Final Table]:::yellow
|
||||
Extract -->|Taxi=Green| GreenFinalTable[Create Green Final Table]:::green
|
||||
YellowFinalTable --> YellowMonthlyTable[Create Yellow Monthly Table]:::yellow
|
||||
GreenFinalTable --> GreenMonthlyTable[Create Green Monthly Table]:::green
|
||||
YellowMonthlyTable --> YellowCopyIn[Load Data to Monthly Table]:::yellow
|
||||
GreenMonthlyTable --> GreenCopyIn[Load Data to Monthly Table]:::green
|
||||
YellowCopyIn --> YellowMerge[Merge Yellow Data]:::yellow
|
||||
GreenCopyIn --> GreenMerge[Merge Green Data]:::green
|
||||
|
||||
classDef yellow fill:#FFD700,stroke:#000,stroke-width:1px;
|
||||
classDef green fill:#32CD32,stroke:#000,stroke-width:1px;
|
||||
```
|
||||
|
||||
The flow code: [`02_postgres_taxi.yaml`](flows/02_postgres_taxi.yaml).
|
||||
|
||||
|
||||
> [!NOTE]
|
||||
> The NYC Taxi and Limousine Commission (TLC) Trip Record Data provided on the [nyc.gov](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) website is currently available only in a Parquet format, but this is NOT the dataset we're going to use in this course. For the purpose of this course, we'll use the **CSV files** available [here on GitHub](https://github.com/DataTalksClub/nyc-tlc-data/releases). This is because the Parquet format can be challenging to understand by newcomers, and we want to make the course as accessible as possible — the CSV format can be easily introspected using tools like Excel or Google Sheets, or even a simple text editor.
|
||||
|
||||
### Local DB: Learn Scheduling and Backfills
|
||||
|
||||
We can now schedule the same pipeline shown above to run daily at 9 AM UTC. We'll also demonstrate how to backfill the data pipeline to run on historical data.
|
||||
|
||||
Note: given the large dataset, we'll backfill only data for the green taxi dataset for the year 2019.
|
||||
|
||||
The flow code: [`02_postgres_taxi_scheduled.yaml`](flows/02_postgres_taxi_scheduled.yaml).
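For reference, the daily 9 AM UTC run boils down to a Kestra `Schedule` trigger with a cron expression. A minimal sketch (the trigger id and the `inputs` block are illustrative; the exact definition lives in the flow file linked above):

```yaml
triggers:
  - id: daily_9am                                  # illustrative id
    type: io.kestra.plugin.core.trigger.Schedule
    cron: "0 9 * * *"                              # every day at 09:00 UTC
    inputs:
      taxi: green                                  # pin a flow input for this schedule
```

Backfills for such a trigger can then be launched from the Kestra UI by picking a historical date range.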
|
||||
|
||||
### Local DB: Orchestrate dbt Models
|
||||
|
||||
Now that we have raw data ingested into a local Postgres database, we can use dbt to transform the data into meaningful insights. The flow will sync the dbt models from Git to Kestra and run the `dbt build` command to build the models.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Start[Select dbt command] --> Sync[Sync Namespace Files]
|
||||
Sync --> DbtBuild[Run dbt CLI]
|
||||
```
|
||||
|
||||
The flow code: [`03_postgres_dbt.yaml`](flows/03_postgres_dbt.yaml).
|
||||
|
||||
### Resources
|
||||
- [pgAdmin Download](https://www.pgadmin.org/download/)
|
||||
- [Postgres DB Docker Compose](postgres/docker-compose.yml)
|
||||
|
||||
---
|
||||
|
||||
## 4. ETL Pipelines in Kestra: Google Cloud Platform
|
||||
|
||||
Now that you've learned how to build ETL pipelines locally using Postgres, we are ready to move to the cloud. In this section, we'll load the same Yellow and Green Taxi data to Google Cloud Platform (GCP) using:
|
||||
1. Google Cloud Storage (GCS) as a data lake
|
||||
2. BigQuery as a data warehouse.
|
||||
|
||||
### Videos
|
||||
|
||||
- **2.2.6 - Create an ETL Pipeline with GCS and BigQuery in Kestra**
|
||||
[](https://youtu.be/nKqjjLJ7YXs)
|
||||
- **2.2.7 - Manage Scheduling and Backfills using BigQuery in Kestra**
|
||||
[](https://youtu.be/DoaZ5JWEkH0)
|
||||
- **2.2.8 - Transform Data with dbt and BigQuery in Kestra**
|
||||
[](https://youtu.be/eF_EdV4A1Wk)
|
||||
|
||||
### Setup Google Cloud Platform (GCP)
|
||||
|
||||
Before we start loading data to GCP, we need to set up the Google Cloud Platform.
|
||||
|
||||
First, adjust the flow [`04_gcp_kv.yaml`](flows/04_gcp_kv.yaml) to store your service account, GCP project ID, BigQuery dataset and GCS bucket name (_along with their location_) as KV Store values (a minimal example of such a task follows the list below):
|
||||
- GCP_CREDS
- GCP_PROJECT_ID
- GCP_LOCATION
- GCP_BUCKET_NAME
- GCP_DATASET
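For illustration, storing one of these values from a flow uses Kestra's core KV `Set` task. A minimal sketch, assuming the task type and properties of the core KV plugin (the real tasks are in `04_gcp_kv.yaml`):

```yaml
tasks:
  - id: gcp_project_id
    type: io.kestra.plugin.core.kv.Set
    key: GCP_PROJECT_ID
    value: my-project-id   # hypothetical value; replace with your own GCP project ID
```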
|
||||
|
||||
|
||||
> [!WARNING]
|
||||
> The `GCP_CREDS` service account contains sensitive information. Ensure you keep it secure and do not commit it to Git. Keep it as secure as your passwords.
|
||||
|
||||
### Create GCP Resources
|
||||
|
||||
If you haven't already created the GCS bucket and BigQuery dataset in the first week of the course, you can use this flow to create them: [`05_gcp_setup.yaml`](flows/05_gcp_setup.yaml).
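Conceptually, that setup flow only needs two tasks: one creating the bucket and one creating the dataset. A rough sketch using Kestra's GCP plugin (task types, properties, and the `kv()` lookups are assumptions; refer to `05_gcp_setup.yaml` for the working version):

```yaml
tasks:
  - id: create_gcs_bucket
    type: io.kestra.plugin.gcp.gcs.CreateBucket
    ifExists: SKIP                       # don't fail if the bucket already exists
    name: "{{kv('GCP_BUCKET_NAME')}}"

  - id: create_bq_dataset
    type: io.kestra.plugin.gcp.bigquery.CreateDataset
    ifExists: SKIP
    name: "{{kv('GCP_DATASET')}}"
```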
|
||||
|
||||
|
||||
### GCP Workflow: Load Taxi Data to BigQuery
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
SetLabel[Set Labels] --> Extract[Extract CSV Data]
|
||||
Extract --> UploadToGCS[Upload Data to GCS]
|
||||
UploadToGCS -->|Taxi=Yellow| BQYellowTripdata[Main Yellow Tripdata Table]:::yellow
|
||||
UploadToGCS -->|Taxi=Green| BQGreenTripdata[Main Green Tripdata Table]:::green
|
||||
BQYellowTripdata --> BQYellowTableExt[External Table]:::yellow
|
||||
BQGreenTripdata --> BQGreenTableExt[External Table]:::green
|
||||
BQYellowTableExt --> BQYellowTableTmp[Monthly Table]:::yellow
|
||||
BQGreenTableExt --> BQGreenTableTmp[Monthly Table]:::green
|
||||
BQYellowTableTmp --> BQYellowMerge[Merge to Main Table]:::yellow
|
||||
BQGreenTableTmp --> BQGreenMerge[Merge to Main Table]:::green
|
||||
BQYellowMerge --> PurgeFiles[Purge Files]
|
||||
BQGreenMerge --> PurgeFiles[Purge Files]
|
||||
|
||||
classDef yellow fill:#FFD700,stroke:#000,stroke-width:1px;
|
||||
classDef green fill:#32CD32,stroke:#000,stroke-width:1px;
|
||||
```
|
||||
|
||||
The flow code: [`06_gcp_taxi.yaml`](flows/06_gcp_taxi.yaml).
|
||||
|
||||
### GCP Workflow: Schedule and Backfill Full Dataset
|
||||
|
||||
We can now schedule the same pipeline shown above to run daily at 9 AM UTC for the green dataset and at 10 AM UTC for the yellow dataset. You can backfill historical data directly from the Kestra UI.
|
||||
|
||||
Since we now process data in a cloud environment with infinitely scalable storage and compute, we can backfill the entire dataset for both the yellow and green taxi data without the risk of running out of resources on our local machine.
|
||||
|
||||
The flow code: [`06_gcp_taxi_scheduled.yaml`](flows/06_gcp_taxi_scheduled.yaml).
|
||||
|
||||
### GCP Workflow: Orchestrate dbt Models
|
||||
|
||||
Now that we have raw data ingested into BigQuery, we can use dbt to transform that data. The flow will sync the dbt models from Git to Kestra and run the `dbt build` command to build the models:
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Start[Select dbt command] --> Sync[Sync Namespace Files]
|
||||
Sync --> Build[Run dbt Build Command]
|
||||
```
|
||||
|
||||
The flow code: [`07_gcp_dbt.yaml`](flows/07_gcp_dbt.yaml).
|
||||
|
||||
---
|
||||
|
||||
## 5. Bonus: Deploy to the Cloud
|
||||
|
||||
Now that we've got our ETL pipeline working both locally and in the cloud, we can deploy Kestra itself to the cloud so it can continue to orchestrate our monthly ETL pipelines on the configured schedules. We'll cover how to install Kestra on Google Cloud in production and how to automatically sync and deploy your workflows from a Git repository.
|
||||
|
||||
### Videos
|
||||
|
||||
- **2.2.9 - Deploy Workflows to the Cloud with Git**
|
||||
[](https://youtu.be/l-wC71tI3co)
|
||||
|
||||
Resources
|
||||
- [Slides](https://docs.google.com/presentation/d/17zSxG5Z-tidmgY-9l7Al1cPmz4Slh4VPK6o2sryFYvw/)
|
||||
|
||||
### 2.2.2 - 🧙♂️ Intro to Mage
|
||||
- [Install Kestra on Google Cloud](https://go.kestra.io/de-zoomcamp/gcp-install)
|
||||
- [Moving from Development to Production](https://go.kestra.io/de-zoomcamp/dev-to-prod)
|
||||
- [Using Git in Kestra](https://go.kestra.io/de-zoomcamp/git)
|
||||
- [Deploy Flows with GitHub Actions](https://go.kestra.io/de-zoomcamp/deploy-github-actions)
|
||||
|
||||
In this section, we'll introduce the Mage platform. We'll cover what makes Mage different from other orchestrators, the fundamental concepts behind Mage, and how to get started. To cap it off, we'll spin Mage up via Docker 🐳 and run a simple pipeline.
|
||||
## 6. Additional Resources 📚
|
||||
|
||||
Videos
|
||||
- 2.2.2a - What is Mage?
|
||||
|
||||
[](https://youtu.be/AicKRcK3pa4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=18)
|
||||
|
||||
- 2.2.2b - Configuring Mage
|
||||
|
||||
[](https://youtu.be/tNiV7Wp08XE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=19)
|
||||
|
||||
- 2.2.2c - A Simple Pipeline
|
||||
|
||||
[](https://youtu.be/stI-gg4QBnI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=20)
|
||||
|
||||
Resources
|
||||
- [Getting Started Repo](https://github.com/mage-ai/mage-zoomcamp)
|
||||
- [Slides](https://docs.google.com/presentation/d/1y_5p3sxr6Xh1RqE6N8o2280gUzAdiic2hPhYUUD6l88/)
|
||||
|
||||
### 2.2.3 - 🐘 ETL: API to Postgres
|
||||
|
||||
Hooray! Mage is up and running. Now, let's build a _real_ pipeline. In this section, we'll build a simple ETL pipeline that loads data from an API into a Postgres database. Our database will be built using Docker— it will be running locally, but it's the same as if it were running in the cloud.
|
||||
|
||||
Videos
|
||||
- 2.2.3a - Configuring Postgres
|
||||
|
||||
[](https://youtu.be/pmhI-ezd3BE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=21)
|
||||
|
||||
- 2.2.3b - Writing an ETL Pipeline : API to postgres
|
||||
|
||||
[](https://youtu.be/Maidfe7oKLs&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=22)
|
||||
- Check [Kestra Docs](https://go.kestra.io/de-zoomcamp/docs)
|
||||
- Explore our [Blueprints](https://go.kestra.io/de-zoomcamp/blueprints) library
|
||||
- Browse over 600 [plugins](https://go.kestra.io/de-zoomcamp/plugins) available in Kestra
|
||||
- Give us a star on [GitHub](https://go.kestra.io/de-zoomcamp/github)
|
||||
- Join our [Slack community](https://go.kestra.io/de-zoomcamp/slack) if you have any questions
|
||||
- Find all the videos in this [YouTube Playlist](https://go.kestra.io/de-zoomcamp/yt-playlist)
|
||||
|
||||
|
||||
### 2.2.4 - 🤓 ETL: API to GCS
|
||||
### Troubleshooting tips
|
||||
|
||||
Ok, so we've written data _locally_ to a database, but what about the cloud? In this tutorial, we'll walk through the process of using Mage to extract, transform, and load data from an API to Google Cloud Storage (GCS).
|
||||
If you encounter an error similar to:
|
||||
|
||||
We'll cover both writing _partitioned_ and _unpartitioned_ data to GCS and discuss _why_ you might want to do one over the other. Many data teams start with extracting data from a source and writing it to a data lake _before_ loading it to a structured data source, like a database.
|
||||
```
|
||||
BigQueryError{reason=invalid, location=null,
|
||||
message=Error while reading table: kestra-sandbox.zooomcamp.yellow_tripdata_2020_01,
|
||||
error message: CSV table references column position 17, but line contains only 14 columns.;
|
||||
line_number: 2103925 byte_offset_to_start_of_line: 194863028
|
||||
column_index: 17 column_name: "congestion_surcharge" column_type: NUMERIC
|
||||
File: gs://anna-geller/yellow_tripdata_2020-01.csv}
|
||||
```
|
||||
|
||||
Videos
|
||||
- 2.2.4a - Configuring GCP
|
||||
|
||||
[](https://youtu.be/00LP360iYvE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=23)
|
||||
|
||||
- 2.2.4b - Writing an ETL Pipeline : API to GCS
|
||||
|
||||
[](https://youtu.be/w0XmcASRUnc&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=24)
|
||||
|
||||
Resources
|
||||
- [DTC Zoomcamp GCP Setup](../01-docker-terraform/1_terraform_gcp/2_gcp_overview.md)
|
||||
|
||||
### 2.2.5 - 🔍 ETL: GCS to BigQuery
|
||||
|
||||
Now that we've written data to GCS, let's load it into BigQuery. In this section, we'll walk through the process of using Mage to load our data from GCS to BigQuery. This closely mirrors a very common data engineering workflow: loading data from a data lake into a data warehouse.
|
||||
|
||||
Videos
|
||||
- 2.2.5a - Writing an ETL Pipeline : GCS to BigQuery
|
||||
|
||||
[](https://youtu.be/JKp_uzM-XsM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=25)
|
||||
|
||||
### 2.2.6 - 👨💻 Parameterized Execution
|
||||
|
||||
By now you're familiar with building pipelines, but what about adding parameters? In this video, we'll discuss some built-in runtime variables that exist in Mage and show you how to define your own! We'll also cover how to use these variables to parameterize your pipelines. Finally, we'll talk about what it means to *backfill* a pipeline and how to do it in Mage.
|
||||
|
||||
Videos
|
||||
- 2.2.6a - Parameterized Execution
|
||||
|
||||
[](https://youtu.be/H0hWjWxB-rg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=26)
|
||||
It means that the CSV file you're trying to load into BigQuery has a mismatch in the number of columns between the external source table (i.e. the file in GCS) and the destination table in BigQuery. This usually happens when, due to network or transfer issues, the file was not fully downloaded from GitHub or not correctly uploaded to GCS. The error message suggests a schema issue, but that's not the root cause. Simply rerun the entire execution, including redownloading the CSV file and reuploading it to GCS; this should resolve the issue.
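If you prefer the command line over the UI, an execution can also be retriggered through Kestra's API (this assumes the default local setup from this module; the flow id and input values below are just an example, and inputs are passed as form fields):

```bash
# Re-run the GCP taxi flow for the affected month
curl -X POST http://localhost:8080/api/v1/executions/zoomcamp/06_gcp_taxi \
  -F taxi=yellow -F year=2020 -F month=01
```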
|
||||
|
||||
|
||||
- 2.2.6b - Backfills
|
||||
|
||||
[](https://youtu.be/ZoeC6Ag5gQc&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=27)
|
||||
|
||||
Resources
|
||||
- [Mage Variables Overview](https://docs.mage.ai/development/variables/overview)
|
||||
- [Mage Runtime Variables](https://docs.mage.ai/getting-started/runtime-variable)
|
||||
|
||||
### 2.2.7 - 🤖 Deployment (Optional)
|
||||
|
||||
In this section, we'll cover deploying Mage using Terraform and Google Cloud. This section is optional— it's not *necessary* to learn Mage, but it might be helpful if you're interested in creating a fully deployed project. If you're using Mage in your final project, you'll need to deploy it to the cloud.
|
||||
|
||||
Videos
|
||||
- 2.2.7a - Deployment Prerequisites
|
||||
|
||||
[](https://youtu.be/zAwAX5sxqsg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=28)
|
||||
|
||||
- 2.2.7b - Google Cloud Permissions
|
||||
|
||||
[](https://youtu.be/O_H7DCmq2rA&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=29)
|
||||
|
||||
- 2.2.7c - Deploying to Google Cloud - Part 1
|
||||
|
||||
[](https://youtu.be/9A872B5hb_0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=30)
|
||||
|
||||
- 2.2.7d - Deploying to Google Cloud - Part 2
|
||||
|
||||
[](https://youtu.be/0YExsb2HgLI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=31)
|
||||
|
||||
Resources
|
||||
- [Installing Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli)
|
||||
- [Installing `gcloud` CLI](https://cloud.google.com/sdk/docs/install)
|
||||
- [Mage Terraform Templates](https://github.com/mage-ai/mage-ai-terraform-templates)
|
||||
|
||||
Additional Mage Guides
|
||||
- [Terraform](https://docs.mage.ai/production/deploying-to-cloud/using-terraform)
|
||||
- [Deploying to GCP with Terraform](https://docs.mage.ai/production/deploying-to-cloud/gcp/setup)
|
||||
|
||||
### 2.2.8 - 🗒️ Homework
|
||||
|
||||
We've prepared a short exercise to test you on what you've learned this week. You can find the homework [here](../cohorts/2024/02-workflow-orchestration/homework.md). This follows closely from the contents of the course and shouldn't take more than an hour or two to complete. 😄
|
||||
|
||||
### 2.2.9 - 👣 Next Steps
|
||||
|
||||
Congratulations! You've completed Week 2 of the Data Engineering Zoomcamp. We hope you've enjoyed learning about Mage and that you're excited to use it in your final project. If you have any questions, feel free to reach out to us on Slack. Be sure to check out our "Next Steps" video for some inspiration for the rest of your journey 😄.
|
||||
|
||||
Videos
|
||||
- 2.2.9 - Next Steps
|
||||
|
||||
[](https://youtu.be/uUtj7N0TleQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=32)
|
||||
|
||||
Resources
|
||||
- [Slides](https://docs.google.com/presentation/d/1yN-e22VNwezmPfKrZkgXQVrX5owDb285I2HxHWgmAEQ/edit#slide=id.g262fb0d2905_0_12)
|
||||
|
||||
### 📑 Additional Resources
|
||||
|
||||
- [Mage Docs](https://docs.mage.ai/)
|
||||
- [Mage Guides](https://docs.mage.ai/guides)
|
||||
- [Mage Slack](https://www.mage.ai/chat)
|
||||
|
||||
---
|
||||
|
||||
# Community notes
|
||||
|
||||
Did you take notes? You can share them here:
|
||||
Did you take notes? You can share them by creating a PR to this file!
|
||||
|
||||
## 2024 notes
|
||||
|
||||
* [2024 Videos transcripts week 2](https://drive.google.com/drive/folders/1yxT0uMMYKa6YOxanh91wGqmQUMS7yYW7?usp=sharing) by Maria Fisher
|
||||
* [Notes from Jonah Oliver](https://www.jonahboliver.com/blog/de-zc-w2)
|
||||
* [Notes from Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/2-workflow-orchestration/readme.md)
|
||||
* [Notes from Kirill](https://github.com/kirill505/data-engineering-zoomcamp/blob/main/02-workflow-orchestration/README.md)
|
||||
* [Notes from Zharko](https://www.zharconsulting.com/contents/data/data-engineering-bootcamp-2024/week-2-ingesting-data-with-mage/)
|
||||
* [Notes from Manuel Guerra)](https://github.com/ManuelGuerra1987/data-engineering-zoomcamp-notes/blob/main/2_Workflow-Orchestration-(Kestra)/README.md)
|
||||
* [Notes from Horeb Seidou](https://www.notion.so/Week-2-Workflow-Orchestration-17129780dc4a80148debf61e6453fffe?pvs=4)
|
||||
* Add your notes above this line
|
||||
|
||||
## 2023 notes
|
||||
---
|
||||
|
||||
See [here](../cohorts/2023/week_2_workflow_orchestration#community-notes)
|
||||
# Previous Cohorts
|
||||
|
||||
* 2022: [notes](../../2022/week_2_data_ingestion#community-notes) and [videos](../../2022/week_2_data_ingestion/)
|
||||
* 2023: [notes](../../2023/week_2_workflow_orchestration#community-notes) and [videos](../../2023/week_2_workflow_orchestration/)
|
||||
* 2024: [notes](../../2024/02-workflow-orchestration#community-notes) and [videos](../../2024/02-workflow-orchestration/)
|
||||
|
||||
## 2022 notes
|
||||
|
||||
See [here](../cohorts/2022/week_2_data_ingestion#community-notes)
|
||||
|
||||
02-workflow-orchestration/docker-compose.yml (new file, 62 lines)
@@ -0,0 +1,62 @@
|
||||
volumes:
|
||||
postgres-data:
|
||||
driver: local
|
||||
kestra-data:
|
||||
driver: local
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: postgres
|
||||
volumes:
|
||||
- postgres-data:/var/lib/postgresql/data
|
||||
environment:
|
||||
POSTGRES_DB: kestra
|
||||
POSTGRES_USER: kestra
|
||||
POSTGRES_PASSWORD: k3str4
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
|
||||
kestra:
|
||||
image: kestra/kestra:develop
|
||||
pull_policy: always
|
||||
user: "root"
|
||||
command: server standalone
|
||||
volumes:
|
||||
- kestra-data:/app/storage
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /tmp/kestra-wd:/tmp/kestra-wd
|
||||
environment:
|
||||
KESTRA_CONFIGURATION: |
|
||||
datasources:
|
||||
postgres:
|
||||
url: jdbc:postgresql://postgres:5432/kestra
|
||||
driverClassName: org.postgresql.Driver
|
||||
username: kestra
|
||||
password: k3str4
|
||||
kestra:
|
||||
server:
|
||||
basicAuth:
|
||||
enabled: false
|
||||
username: "admin@kestra.io" # it must be a valid email address
|
||||
password: kestra
|
||||
repository:
|
||||
type: postgres
|
||||
storage:
|
||||
type: local
|
||||
local:
|
||||
basePath: "/app/storage"
|
||||
queue:
|
||||
type: postgres
|
||||
tasks:
|
||||
tmpDir:
|
||||
path: /tmp/kestra-wd/tmp
|
||||
url: http://localhost:8080/
|
||||
ports:
|
||||
- "8080:8080"
|
||||
- "8081:8081"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_started
|
||||
@@ -0,0 +1,55 @@
|
||||
id: 01_getting_started_data_pipeline
|
||||
namespace: zoomcamp
|
||||
|
||||
inputs:
|
||||
- id: columns_to_keep
|
||||
type: ARRAY
|
||||
itemType: STRING
|
||||
defaults:
|
||||
- brand
|
||||
- price
|
||||
|
||||
tasks:
|
||||
- id: extract
|
||||
type: io.kestra.plugin.core.http.Download
|
||||
uri: https://dummyjson.com/products
|
||||
|
||||
- id: transform
|
||||
type: io.kestra.plugin.scripts.python.Script
|
||||
containerImage: python:3.11-alpine
|
||||
inputFiles:
|
||||
data.json: "{{outputs.extract.uri}}"
|
||||
outputFiles:
|
||||
- "*.json"
|
||||
env:
|
||||
COLUMNS_TO_KEEP: "{{inputs.columns_to_keep}}"
|
||||
script: |
|
||||
import json
|
||||
import os
|
||||
|
||||
columns_to_keep_str = os.getenv("COLUMNS_TO_KEEP")
|
||||
columns_to_keep = json.loads(columns_to_keep_str)
|
||||
|
||||
with open("data.json", "r") as file:
|
||||
data = json.load(file)
|
||||
|
||||
filtered_data = [
|
||||
{column: product.get(column, "N/A") for column in columns_to_keep}
|
||||
for product in data["products"]
|
||||
]
|
||||
|
||||
with open("products.json", "w") as file:
|
||||
json.dump(filtered_data, file, indent=4)
|
||||
|
||||
- id: query
|
||||
type: io.kestra.plugin.jdbc.duckdb.Query
|
||||
inputFiles:
|
||||
products.json: "{{outputs.transform.outputFiles['products.json']}}"
|
||||
sql: |
|
||||
INSTALL json;
|
||||
LOAD json;
|
||||
SELECT brand, round(avg(price), 2) as avg_price
|
||||
FROM read_json_auto('{{workingDir}}/products.json')
|
||||
GROUP BY brand
|
||||
ORDER BY avg_price DESC;
|
||||
fetchType: STORE
|
||||
02-workflow-orchestration/flows/02_postgres_taxi.yaml (new file, 270 lines)
@@ -0,0 +1,270 @@
|
||||
id: 02_postgres_taxi
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
The CSV Data used in the course: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: yellow
|
||||
|
||||
- id: year
|
||||
type: SELECT
|
||||
displayName: Select year
|
||||
values: ["2019", "2020"]
|
||||
defaults: "2019"
|
||||
|
||||
- id: month
|
||||
type: SELECT
|
||||
displayName: Select month
|
||||
values: ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
|
||||
defaults: "01"
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv"
|
||||
staging_table: "public.{{inputs.taxi}}_tripdata_staging"
|
||||
table: "public.{{inputs.taxi}}_tripdata"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ inputs.year ~ '-' ~ inputs.month ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: yellow_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: yellow_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge]
|
||||
|
||||
- id: yellow_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(tpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: yellow_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime,
|
||||
passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID,
|
||||
DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount,
|
||||
improvement_surcharge, total_amount, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime,
|
||||
S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID,
|
||||
S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount,
|
||||
S.improvement_surcharge, S.total_amount, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: green_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: green_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge]
|
||||
|
||||
- id: green_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(lpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: green_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime,
|
||||
store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count,
|
||||
trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee,
|
||||
improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime,
|
||||
S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count,
|
||||
S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee,
|
||||
S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: This will remove output files. If you'd like to explore Kestra outputs, disable it.
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.jdbc.postgresql
|
||||
values:
|
||||
url: jdbc:postgresql://host.docker.internal:5432/postgres-zoomcamp
|
||||
username: kestra
|
||||
password: k3str4
|
||||
02-workflow-orchestration/flows/02_postgres_taxi_scheduled.yaml (new file, 275 lines)
@@ -0,0 +1,275 @@
|
||||
id: 02_postgres_taxi_scheduled
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
Best to add a label `backfill:true` from the UI to track executions created via a backfill.
|
||||
CSV data used here comes from: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
concurrency:
|
||||
limit: 1
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: yellow
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{trigger.date | date('yyyy-MM')}}.csv"
|
||||
staging_table: "public.{{inputs.taxi}}_tripdata_staging"
|
||||
table: "public.{{inputs.taxi}}_tripdata"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ (trigger.date | date('yyyy-MM')) ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: yellow_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
tpep_pickup_datetime timestamp,
|
||||
tpep_dropoff_datetime timestamp,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
RatecodeID text,
|
||||
store_and_fwd_flag text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
payment_type integer,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: yellow_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: yellow_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge]
|
||||
|
||||
- id: yellow_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(tpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: yellow_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime,
|
||||
passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID,
|
||||
DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount,
|
||||
improvement_surcharge, total_amount, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime,
|
||||
S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID,
|
||||
S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount,
|
||||
S.improvement_surcharge, S.total_amount, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: green_create_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_create_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS {{render(vars.staging_table)}} (
|
||||
unique_row_id text,
|
||||
filename text,
|
||||
VendorID text,
|
||||
lpep_pickup_datetime timestamp,
|
||||
lpep_dropoff_datetime timestamp,
|
||||
store_and_fwd_flag text,
|
||||
RatecodeID text,
|
||||
PULocationID text,
|
||||
DOLocationID text,
|
||||
passenger_count integer,
|
||||
trip_distance double precision,
|
||||
fare_amount double precision,
|
||||
extra double precision,
|
||||
mta_tax double precision,
|
||||
tip_amount double precision,
|
||||
tolls_amount double precision,
|
||||
ehail_fee double precision,
|
||||
improvement_surcharge double precision,
|
||||
total_amount double precision,
|
||||
payment_type integer,
|
||||
trip_type integer,
|
||||
congestion_surcharge double precision
|
||||
);
|
||||
|
||||
- id: green_truncate_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
TRUNCATE TABLE {{render(vars.staging_table)}};
|
||||
|
||||
- id: green_copy_in_to_staging_table
|
||||
type: io.kestra.plugin.jdbc.postgresql.CopyIn
|
||||
format: CSV
|
||||
from: "{{render(vars.data)}}"
|
||||
table: "{{render(vars.staging_table)}}"
|
||||
header: true
|
||||
columns: [VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge]
|
||||
|
||||
- id: green_add_unique_id_and_filename
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
UPDATE {{render(vars.staging_table)}}
|
||||
SET
|
||||
unique_row_id = md5(
|
||||
COALESCE(CAST(VendorID AS text), '') ||
|
||||
COALESCE(CAST(lpep_pickup_datetime AS text), '') ||
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS text), '') ||
|
||||
COALESCE(PULocationID, '') ||
|
||||
COALESCE(DOLocationID, '') ||
|
||||
COALESCE(CAST(fare_amount AS text), '') ||
|
||||
COALESCE(CAST(trip_distance AS text), '')
|
||||
),
|
||||
filename = '{{render(vars.file)}}';
|
||||
|
||||
- id: green_merge_data
|
||||
type: io.kestra.plugin.jdbc.postgresql.Queries
|
||||
sql: |
|
||||
MERGE INTO {{render(vars.table)}} AS T
|
||||
USING {{render(vars.staging_table)}} AS S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (
|
||||
unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime,
|
||||
store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count,
|
||||
trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee,
|
||||
improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge
|
||||
)
|
||||
VALUES (
|
||||
S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime,
|
||||
S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count,
|
||||
S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee,
|
||||
S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge
|
||||
);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: To avoid cluttering your storage, we will remove the downloaded files
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.jdbc.postgresql
|
||||
values:
|
||||
url: jdbc:postgresql://host.docker.internal:5432/postgres-zoomcamp
|
||||
username: kestra
|
||||
password: k3str4
|
||||
|
||||
triggers:
|
||||
- id: green_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 9 1 * *"
|
||||
inputs:
|
||||
taxi: green
|
||||
|
||||
- id: yellow_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 10 1 * *"
|
||||
inputs:
|
||||
taxi: yellow
|
||||
59
02-workflow-orchestration/flows/03_postgres_dbt.yaml
Normal file
@ -0,0 +1,59 @@
id: 03_postgres_dbt
namespace: zoomcamp
inputs:
- id: dbt_command
type: SELECT
allowCustomValue: true
defaults: dbt build
values:
- dbt build
- dbt debug # use when running the first time to validate DB connection
tasks:
- id: sync
type: io.kestra.plugin.git.SyncNamespaceFiles
url: https://github.com/DataTalksClub/data-engineering-zoomcamp
branch: main
namespace: "{{ flow.namespace }}"
gitDirectory: 04-analytics-engineering/taxi_rides_ny
dryRun: false
# disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled

- id: dbt-build
type: io.kestra.plugin.dbt.cli.DbtCLI
env:
DBT_DATABASE: postgres-zoomcamp
DBT_SCHEMA: public
namespaceFiles:
enabled: true
containerImage: ghcr.io/kestra-io/dbt-postgres:latest
taskRunner:
type: io.kestra.plugin.scripts.runner.docker.Docker
commands:
- dbt deps
- "{{ inputs.dbt_command }}"
storeManifest:
key: manifest.json
namespace: "{{ flow.namespace }}"
profiles: |
default:
outputs:
dev:
type: postgres
host: host.docker.internal
user: kestra
password: k3str4
port: 5432
dbname: postgres-zoomcamp
schema: public
threads: 8
connect_timeout: 10
priority: interactive
target: dev
description: |
Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables.
```yaml
sources:
- name: staging
database: postgres-zoomcamp
schema: public
```
37
02-workflow-orchestration/flows/04_gcp_kv.yaml
Normal file
@ -0,0 +1,37 @@
id: 04_gcp_kv
namespace: zoomcamp

tasks:
- id: gcp_creds
type: io.kestra.plugin.core.kv.Set
key: GCP_CREDS
kvType: JSON
value: |
{
"type": "service_account",
"project_id": "...",
}

- id: gcp_project_id
type: io.kestra.plugin.core.kv.Set
key: GCP_PROJECT_ID
kvType: STRING
value: kestra-sandbox # TODO replace with your project id

- id: gcp_location
type: io.kestra.plugin.core.kv.Set
key: GCP_LOCATION
kvType: STRING
value: europe-west2

- id: gcp_bucket_name
type: io.kestra.plugin.core.kv.Set
key: GCP_BUCKET_NAME
kvType: STRING
value: your-name-kestra # TODO make sure it's globally unique!

- id: gcp_dataset
type: io.kestra.plugin.core.kv.Set
key: GCP_DATASET
kvType: STRING
value: zoomcamp
22
02-workflow-orchestration/flows/05_gcp_setup.yaml
Normal file
@ -0,0 +1,22 @@
id: 05_gcp_setup
namespace: zoomcamp

tasks:
- id: create_gcs_bucket
type: io.kestra.plugin.gcp.gcs.CreateBucket
ifExists: SKIP
storageClass: REGIONAL
name: "{{kv('GCP_BUCKET_NAME')}}" # make sure it's globally unique!

- id: create_bq_dataset
type: io.kestra.plugin.gcp.bigquery.CreateDataset
name: "{{kv('GCP_DATASET')}}"
ifExists: SKIP

pluginDefaults:
- type: io.kestra.plugin.gcp
values:
serviceAccount: "{{kv('GCP_CREDS')}}"
projectId: "{{kv('GCP_PROJECT_ID')}}"
location: "{{kv('GCP_LOCATION')}}"
bucket: "{{kv('GCP_BUCKET_NAME')}}"
248
02-workflow-orchestration/flows/06_gcp_taxi.yaml
Normal file
@ -0,0 +1,248 @@
|
||||
id: 06_gcp_taxi
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
The CSV Data used in the course: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: green
|
||||
|
||||
- id: year
|
||||
type: SELECT
|
||||
displayName: Select year
|
||||
values: ["2019", "2020"]
|
||||
defaults: "2019"
|
||||
allowCustomValue: true # allows you to type 2021 from the UI for the homework 🤗
|
||||
|
||||
- id: month
|
||||
type: SELECT
|
||||
displayName: Select month
|
||||
values: ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
|
||||
defaults: "01"
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv"
|
||||
gcs_file: "gs://{{kv('GCP_BUCKET_NAME')}}/{{vars.file}}"
|
||||
table: "{{kv('GCP_DATASET')}}.{{inputs.taxi}}_tripdata_{{inputs.year}}_{{inputs.month}}"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ inputs.year ~ '-' ~ inputs.month ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: upload_to_gcs
|
||||
type: io.kestra.plugin.gcp.gcs.Upload
|
||||
from: "{{render(vars.data)}}"
|
||||
to: "{{render(vars.gcs_file)}}"
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: bq_yellow_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(tpep_pickup_datetime);
|
||||
|
||||
- id: bq_yellow_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_yellow_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(tpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_yellow_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime, S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID, S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.improvement_surcharge, S.total_amount, S.congestion_surcharge);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: bq_green_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(lpep_pickup_datetime);
|
||||
|
||||
- id: bq_green_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_green_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(lpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_green_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime, S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count, S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee, S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: If you'd like to explore Kestra outputs, disable it.
|
||||
disabled: false
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.gcp
|
||||
values:
|
||||
serviceAccount: "{{kv('GCP_CREDS')}}"
|
||||
projectId: "{{kv('GCP_PROJECT_ID')}}"
|
||||
location: "{{kv('GCP_LOCATION')}}"
|
||||
bucket: "{{kv('GCP_BUCKET_NAME')}}"
|
||||
249
02-workflow-orchestration/flows/06_gcp_taxi_scheduled.yaml
Normal file
@ -0,0 +1,249 @@
|
||||
|
||||
id: 06_gcp_taxi_scheduled
|
||||
namespace: zoomcamp
|
||||
description: |
|
||||
Best to add a label `backfill:true` from the UI to track executions created via a backfill.
|
||||
CSV data used here comes from: https://github.com/DataTalksClub/nyc-tlc-data/releases
|
||||
|
||||
inputs:
|
||||
- id: taxi
|
||||
type: SELECT
|
||||
displayName: Select taxi type
|
||||
values: [yellow, green]
|
||||
defaults: green
|
||||
|
||||
variables:
|
||||
file: "{{inputs.taxi}}_tripdata_{{trigger.date | date('yyyy-MM')}}.csv"
|
||||
gcs_file: "gs://{{kv('GCP_BUCKET_NAME')}}/{{vars.file}}"
|
||||
table: "{{kv('GCP_DATASET')}}.{{inputs.taxi}}_tripdata_{{trigger.date | date('yyyy_MM')}}"
|
||||
data: "{{outputs.extract.outputFiles[inputs.taxi ~ '_tripdata_' ~ (trigger.date | date('yyyy-MM')) ~ '.csv']}}"
|
||||
|
||||
tasks:
|
||||
- id: set_label
|
||||
type: io.kestra.plugin.core.execution.Labels
|
||||
labels:
|
||||
file: "{{render(vars.file)}}"
|
||||
taxi: "{{inputs.taxi}}"
|
||||
|
||||
- id: extract
|
||||
type: io.kestra.plugin.scripts.shell.Commands
|
||||
outputFiles:
|
||||
- "*.csv"
|
||||
taskRunner:
|
||||
type: io.kestra.plugin.core.runner.Process
|
||||
commands:
|
||||
- wget -qO- https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{{inputs.taxi}}/{{render(vars.file)}}.gz | gunzip > {{render(vars.file)}}
|
||||
|
||||
- id: upload_to_gcs
|
||||
type: io.kestra.plugin.gcp.gcs.Upload
|
||||
from: "{{render(vars.data)}}"
|
||||
to: "{{render(vars.gcs_file)}}"
|
||||
|
||||
- id: if_yellow_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'yellow'}}"
|
||||
then:
|
||||
- id: bq_yellow_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(tpep_pickup_datetime);
|
||||
|
||||
- id: bq_yellow_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
tpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
tpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
passenger_count INTEGER OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. TRUE = store and forward trip, FALSE = not a store and forward trip'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_yellow_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(tpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(tpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_yellow_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.yellow_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.tpep_pickup_datetime, S.tpep_dropoff_datetime, S.passenger_count, S.trip_distance, S.RatecodeID, S.store_and_fwd_flag, S.PULocationID, S.DOLocationID, S.payment_type, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.improvement_surcharge, S.total_amount, S.congestion_surcharge);
|
||||
|
||||
- id: if_green_taxi
|
||||
type: io.kestra.plugin.core.flow.If
|
||||
condition: "{{inputs.taxi == 'green'}}"
|
||||
then:
|
||||
- id: bq_green_tripdata
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE TABLE IF NOT EXISTS `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata`
|
||||
(
|
||||
unique_row_id BYTES OPTIONS (description = 'A unique identifier for the trip, generated by hashing key trip attributes.'),
|
||||
filename STRING OPTIONS (description = 'The source filename from which the trip data was loaded.'),
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
PARTITION BY DATE(lpep_pickup_datetime);
|
||||
|
||||
- id: bq_green_table_ext
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE EXTERNAL TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`
|
||||
(
|
||||
VendorID STRING OPTIONS (description = 'A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.'),
|
||||
lpep_pickup_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was engaged'),
|
||||
lpep_dropoff_datetime TIMESTAMP OPTIONS (description = 'The date and time when the meter was disengaged'),
|
||||
store_and_fwd_flag STRING OPTIONS (description = 'This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka "store and forward," because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip'),
|
||||
RatecodeID STRING OPTIONS (description = 'The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride'),
|
||||
PULocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was engaged'),
|
||||
DOLocationID STRING OPTIONS (description = 'TLC Taxi Zone in which the taximeter was disengaged'),
|
||||
passenger_count INT64 OPTIONS (description = 'The number of passengers in the vehicle. This is a driver-entered value.'),
|
||||
trip_distance NUMERIC OPTIONS (description = 'The elapsed trip distance in miles reported by the taximeter.'),
|
||||
fare_amount NUMERIC OPTIONS (description = 'The time-and-distance fare calculated by the meter'),
|
||||
extra NUMERIC OPTIONS (description = 'Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges'),
|
||||
mta_tax NUMERIC OPTIONS (description = '$0.50 MTA tax that is automatically triggered based on the metered rate in use'),
|
||||
tip_amount NUMERIC OPTIONS (description = 'Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.'),
|
||||
tolls_amount NUMERIC OPTIONS (description = 'Total amount of all tolls paid in trip.'),
|
||||
ehail_fee NUMERIC,
|
||||
improvement_surcharge NUMERIC OPTIONS (description = '$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.'),
|
||||
total_amount NUMERIC OPTIONS (description = 'The total amount charged to passengers. Does not include cash tips.'),
|
||||
payment_type INTEGER OPTIONS (description = 'A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip'),
|
||||
trip_type STRING OPTIONS (description = 'A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch'),
|
||||
congestion_surcharge NUMERIC OPTIONS (description = 'Congestion surcharge applied to trips in congested zones')
|
||||
)
|
||||
OPTIONS (
|
||||
format = 'CSV',
|
||||
uris = ['{{render(vars.gcs_file)}}'],
|
||||
skip_leading_rows = 1,
|
||||
ignore_unknown_values = TRUE
|
||||
);
|
||||
|
||||
- id: bq_green_table_tmp
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
CREATE OR REPLACE TABLE `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}`
|
||||
AS
|
||||
SELECT
|
||||
MD5(CONCAT(
|
||||
COALESCE(CAST(VendorID AS STRING), ""),
|
||||
COALESCE(CAST(lpep_pickup_datetime AS STRING), ""),
|
||||
COALESCE(CAST(lpep_dropoff_datetime AS STRING), ""),
|
||||
COALESCE(CAST(PULocationID AS STRING), ""),
|
||||
COALESCE(CAST(DOLocationID AS STRING), "")
|
||||
)) AS unique_row_id,
|
||||
"{{render(vars.file)}}" AS filename,
|
||||
*
|
||||
FROM `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}_ext`;
|
||||
|
||||
- id: bq_green_merge
|
||||
type: io.kestra.plugin.gcp.bigquery.Query
|
||||
sql: |
|
||||
MERGE INTO `{{kv('GCP_PROJECT_ID')}}.{{kv('GCP_DATASET')}}.green_tripdata` T
|
||||
USING `{{kv('GCP_PROJECT_ID')}}.{{render(vars.table)}}` S
|
||||
ON T.unique_row_id = S.unique_row_id
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (unique_row_id, filename, VendorID, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, RatecodeID, PULocationID, DOLocationID, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge)
|
||||
VALUES (S.unique_row_id, S.filename, S.VendorID, S.lpep_pickup_datetime, S.lpep_dropoff_datetime, S.store_and_fwd_flag, S.RatecodeID, S.PULocationID, S.DOLocationID, S.passenger_count, S.trip_distance, S.fare_amount, S.extra, S.mta_tax, S.tip_amount, S.tolls_amount, S.ehail_fee, S.improvement_surcharge, S.total_amount, S.payment_type, S.trip_type, S.congestion_surcharge);
|
||||
|
||||
- id: purge_files
|
||||
type: io.kestra.plugin.core.storage.PurgeCurrentExecutionFiles
|
||||
description: To avoid cluttering your storage, we will remove the downloaded files
|
||||
|
||||
pluginDefaults:
|
||||
- type: io.kestra.plugin.gcp
|
||||
values:
|
||||
serviceAccount: "{{kv('GCP_CREDS')}}"
|
||||
projectId: "{{kv('GCP_PROJECT_ID')}}"
|
||||
location: "{{kv('GCP_LOCATION')}}"
|
||||
bucket: "{{kv('GCP_BUCKET_NAME')}}"
|
||||
|
||||
triggers:
|
||||
- id: green_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 9 1 * *"
|
||||
inputs:
|
||||
taxi: green
|
||||
|
||||
- id: yellow_schedule
|
||||
type: io.kestra.plugin.core.trigger.Schedule
|
||||
cron: "0 10 1 * *"
|
||||
inputs:
|
||||
taxi: yellow
|
||||
62
02-workflow-orchestration/flows/07_gcp_dbt.yaml
Normal file
@ -0,0 +1,62 @@
id: 07_gcp_dbt
namespace: zoomcamp
inputs:
- id: dbt_command
type: SELECT
allowCustomValue: true
defaults: dbt build
values:
- dbt build
- dbt debug # use when running the first time to validate DB connection

tasks:
- id: sync
type: io.kestra.plugin.git.SyncNamespaceFiles
url: https://github.com/DataTalksClub/data-engineering-zoomcamp
branch: main
namespace: "{{flow.namespace}}"
gitDirectory: 04-analytics-engineering/taxi_rides_ny
dryRun: false
# disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled

- id: dbt-build
type: io.kestra.plugin.dbt.cli.DbtCLI
env:
DBT_DATABASE: "{{kv('GCP_PROJECT_ID')}}"
DBT_SCHEMA: "{{kv('GCP_DATASET')}}"
namespaceFiles:
enabled: true
containerImage: ghcr.io/kestra-io/dbt-bigquery:latest
taskRunner:
type: io.kestra.plugin.scripts.runner.docker.Docker
inputFiles:
sa.json: "{{kv('GCP_CREDS')}}"
commands:
- dbt deps
- "{{ inputs.dbt_command }}"
storeManifest:
key: manifest.json
namespace: "{{ flow.namespace }}"
profiles: |
default:
outputs:
dev:
type: bigquery
dataset: "{{kv('GCP_DATASET')}}"
project: "{{kv('GCP_PROJECT_ID')}}"
location: "{{kv('GCP_LOCATION')}}"
keyfile: sa.json
method: service-account
priority: interactive
threads: 16
timeout_seconds: 300
fixed_retries: 1
target: dev
description: |
Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables.
```yaml
sources:
- name: staging
database: kestra-sandbox
schema: zoomcamp
```
57
02-workflow-orchestration/homework.md
Normal file
@ -0,0 +1,57 @@
## Module 2 Homework

### Assignment

So far in the course, we processed data for the years 2019 and 2020. Your task is to extend the existing flows to include data for the year 2021.



As a hint, Kestra makes that process really easy:
1. You can leverage the backfill functionality in the [scheduled flow](../flows/07_gcp_taxi_scheduled.yaml) to backfill the data for the year 2021. Just make sure to select the time period for which data exists, i.e. from `2021-01-01` to `2021-07-31`. Also, make sure to do the same for both `yellow` and `green` taxi data (select the right service in the `taxi` input).
2. Alternatively, run the flow manually for each of the seven months of 2021 for both `yellow` and `green` taxi data. Challenge for you: find out how to loop over the combination of Year-Month and `taxi` type using a `ForEach` task which triggers the flow for each combination using a `Subflow` task (a rough sketch of this approach is shown below).
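Below is a minimal, hedged sketch of the looping approach from hint 2. It assumes a recent Kestra release where the core `ForEach` and `Subflow` tasks are available, and it calls the GCP ingestion flow shown earlier in this change (adjust `flowId` if your copy is numbered differently). The flow id `backfill_2021` is made up, and the expression for reading the outer loop's value may be `parents[0].taskrun.value` on some Kestra versions.

```yaml
id: backfill_2021            # hypothetical flow id, not part of the course flows
namespace: zoomcamp

tasks:
  - id: for_each_month
    type: io.kestra.plugin.core.flow.ForEach
    values: ["01", "02", "03", "04", "05", "06", "07"]   # months of 2021 for which CSV data exists
    tasks:
      - id: for_each_taxi
        type: io.kestra.plugin.core.flow.ForEach
        values: ["yellow", "green"]
        tasks:
          - id: run_taxi_flow
            type: io.kestra.plugin.core.flow.Subflow
            namespace: zoomcamp
            flowId: 06_gcp_taxi                       # adjust to the id of your ingestion flow
            wait: true                                # wait for each execution before starting the next
            inputs:
              taxi: "{{ taskrun.value }}"             # value of the inner (taxi) loop
              year: "2021"                            # works because the year input has allowCustomValue: true
              month: "{{ parent.taskrun.value }}"     # value of the outer (month) loop; may be parents[0].taskrun.value
```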
### Quiz Questions

Complete the Quiz shown below. It’s a set of 6 multiple-choice questions to test your understanding of workflow orchestration, Kestra and ETL pipelines for data lakes and warehouses.

1) Within the execution for `Yellow` Taxi data for the year `2020` and month `12`: what is the uncompressed file size (i.e. the output file `yellow_tripdata_2020-12.csv` of the `extract` task)?
- 128.3 MB
- 134.5 MB
- 364.7 MB
- 692.6 MB

2) What is the value of the variable `file` when the inputs `taxi` is set to `green`, `year` is set to `2020`, and `month` is set to `04` during execution?
- `{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv`
- `green_tripdata_2020-04.csv`
- `green_tripdata_04_2020.csv`
- `green_tripdata_2020.csv`

3) How many rows are there for the `Yellow` Taxi data for the year 2020?
- 13,537.299
- 24,648,499
- 18,324,219
- 29,430,127

4) How many rows are there for the `Green` Taxi data for the year 2020?
- 5,327,301
- 936,199
- 1,734,051
- 1,342,034

5) Using dbt on the `Green` and `Yellow` Taxi data for the year 2020, how many rows are there in the `fact_trips` table?
- 198
- 165
- 151
- 203

6) How would you configure the timezone to New York in a Schedule trigger?
- Add a `timezone` property set to `EST` in the `Schedule` trigger configuration
- Add a `timezone` property set to `America/New_York` in the `Schedule` trigger configuration
- Add a `timezone` property set to `UTC-5` in the `Schedule` trigger configuration
- Add a `location` property set to `New_York` in the `Schedule` trigger configuration


## Submitting the solutions

* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw2
* Check the link above to see the due date
BIN
02-workflow-orchestration/images/homework.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 716 KiB
15
02-workflow-orchestration/postgres/docker-compose.yml
Normal file
@ -0,0 +1,15 @@
version: "3.8"
services:
postgres:
image: postgres
container_name: postgres-db
environment:
POSTGRES_USER: kestra
POSTGRES_PASSWORD: k3str4
POSTGRES_DB: postgres-zoomcamp
ports:
- "5432:5432"
volumes:
- postgres-data:/var/lib/postgresql/data
volumes:
postgres-data:
@ -1,4 +1,4 @@
# Week 4: Analytics Engineering
# Module 4: Analytics Engineering
Goal: Transforming the data loaded in the DWH into analytical views by developing a [dbt project](taxi_rides_ny/README.md).

### Prerequisites

@ -2,11 +2,8 @@ version: 2
|
||||
|
||||
sources:
|
||||
- name: staging
|
||||
database: taxi-rides-ny-339813-412521
|
||||
# For postgres:
|
||||
#database: production
|
||||
schema: trips_data_all
|
||||
|
||||
database: "{{ env_var('DBT_DATABASE', 'taxi-rides-ny-339813-412521') }}"
|
||||
schema: "{{ env_var('DBT_SCHEMA', 'trips_data_all') }}"
|
||||
# loaded_at_field: record_loaded_at
|
||||
tables:
|
||||
- name: green_tripdata
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# Week 5: Batch Processing
|
||||
# Module 5: Batch Processing
|
||||
|
||||
## 5.1 Introduction
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ print(f'The PySpark {spark.version} version is running...')
|
||||
1. Install Scala
|
||||
|
||||
```bash
|
||||
brew install scala@2.11
|
||||
brew install scala@2.13
|
||||
```
|
||||
|
||||
2. Install Apache Spark
|
||||
|
||||
@ -20,13 +20,21 @@ For example, if the file under `${SPARK_HOME}/python/lib/` is `py4j-0.10.9.3-src
|
||||
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"
|
||||
```
|
||||
|
||||
On Windows, you may have to do path conversion from unix-style to windows-style:
|
||||
|
||||
```bash
|
||||
SPARK_WIN=`cygpath -w ${SPARK_HOME}`
|
||||
|
||||
export PYTHONPATH="${SPARK_WIN}\\python\\"
|
||||
export PYTHONPATH="${SPARK_WIN}\\python\\lib\\py4j-0.10.9-src.zip;$PYTHONPATH"
|
||||
```
|
||||
|
||||
Now you can run Jupyter or IPython to test if things work. Go to some other directory, e.g. `~/tmp`.
|
||||
|
||||
Download a CSV file that we'll use for testing:
|
||||
|
||||
```bash
|
||||
wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
|
||||
wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
|
||||
```
|
||||
|
||||
Now let's run `ipython` (or `jupyter notebook`) and execute:
|
||||
@ -42,7 +50,7 @@ spark = SparkSession.builder \
|
||||
|
||||
df = spark.read \
|
||||
.option("header", "true") \
|
||||
.csv('taxi+_zone_lookup.csv')
|
||||
.csv('taxi_zone_lookup.csv')
|
||||
|
||||
df.show()
|
||||
```
|
||||
|
||||
@ -56,6 +56,19 @@ for FILE in ${FILES}; do
|
||||
done
|
||||
```
|
||||
|
||||
If you don't have wget, you can use curl:
|
||||
|
||||
```bash
|
||||
HADOOP_VERSION="3.2.0"
|
||||
PREFIX="https://raw.githubusercontent.com/cdarlint/winutils/master/hadoop-${HADOOP_VERSION}/bin/"
|
||||
|
||||
FILES="hadoop.dll hadoop.exp hadoop.lib hadoop.pdb libwinutils.lib winutils.exe winutils.pdb"
|
||||
|
||||
for FILE in ${FILES}; do
|
||||
curl -o "${FILE}" "${PREFIX}/${FILE}";
|
||||
done
|
||||
```
|
||||
|
||||
Add it to `PATH`:
|
||||
|
||||
```bash
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# Week 6: Stream Processing
|
||||
# Module 6: Stream Processing
|
||||
|
||||
# Code structure
|
||||
* [Java examples](java)
|
||||
@ -116,7 +116,7 @@ Please follow the steps described under [pyspark-streaming](python/streams-examp
|
||||
|
||||
## Homework
|
||||
|
||||
* [2024 Homework](../cohorts/2024/)
|
||||
* [2024 Homework](../cohorts/2024/06-streaming/homework.md)
|
||||
|
||||
## Community notes
|
||||
|
||||
@ -125,5 +125,7 @@ Did you take notes? You can share them here.
|
||||
* [Notes by Alvaro Navas](https://github.com/ziritrion/dataeng-zoomcamp/blob/main/notes/6_streaming.md )
|
||||
* [Marcos Torregrosa's blog (spanish)](https://www.n4gash.com/2023/data-engineering-zoomcamp-semana-6-stream-processing/)
|
||||
* [Notes by Oscar Garcia](https://github.com/ozkary/Data-Engineering-Bootcamp/tree/main/Step6-Streaming)
|
||||
* [2024 videos transcript](https://drive.google.com/drive/folders/1UngeL5FM-GcDLM7QYaDTKb3jIS6CQC14?usp=drive_link) by Maria Fisher
|
||||
* [Notes by Shayan Shafiee Moghadam](https://github.com/shayansm2/eng-notebook/blob/main/kafka/readme.md)
|
||||
* Add your notes here (above this line)
|
||||
|
||||
|
||||
47
README.md
@ -20,18 +20,15 @@ Syllabus
|
||||
* [Module 4: Analytics Engineering](#module-4-analytics-engineering)
|
||||
* [Module 5: Batch processing](#module-5-batch-processing)
|
||||
* [Module 6: Streaming](#module-6-streaming)
|
||||
* [Workshop 2: Stream Processing with SQL](#workshop-2-stream-processing-with-sql)
|
||||
* [Project](#project)
|
||||
|
||||
## Taking the course
|
||||
|
||||
### 2024 Cohort
|
||||
### 2025 Cohort
|
||||
|
||||
* **Start**: 15 January 2024 (Monday) at 17:00 CET
|
||||
* **Start**: 13 January 2025
|
||||
* **Registration link**: https://airtable.com/shr6oVXeQvSI5HuWD
|
||||
* [Cohort folder](cohorts/2024/) with homeworks and deadlines
|
||||
* [Launch stream with course overview](https://www.youtube.com/live/AtRhA-NfS24?si=5JzA_E8BmJjiLi8l)
|
||||
|
||||
* Materials specific to the cohort: [cohorts/2025/](cohorts/2025/)
|
||||
|
||||
### Self-paced mode
|
||||
|
||||
@ -46,6 +43,8 @@ can take the course at your own pace
|
||||
|
||||
## Syllabus
|
||||
|
||||
We encourage [Learning in Public](learning-in-public.md)
|
||||
|
||||
> **Note:** NYC TLC changed the format of the data we use to parquet.
|
||||
> In the course we still use the CSV files accessible [here](https://github.com/DataTalksClub/nyc-tlc-data).
|
||||
|
||||
@ -67,13 +66,13 @@ can take the course at your own pace
|
||||
|
||||
* Data Lake
|
||||
* Workflow orchestration
|
||||
* Workflow orchestration with Mage
|
||||
* Workflow orchestration with Kestra
|
||||
* Homework
|
||||
|
||||
[More details](02-workflow-orchestration/)
|
||||
|
||||
|
||||
### [Workshop 1: Data Ingestion](cohorts/2024/workshops/dlt.md)
|
||||
### [Workshop 1: Data Ingestion](cohorts/2025/workshops/dlt.md)
|
||||
|
||||
* Reading from apis
|
||||
* Building scalable pipelines
|
||||
@ -82,7 +81,7 @@ can take the course at your own pace
|
||||
* Homework
|
||||
|
||||
|
||||
[More details](cohorts/2024/workshops/dlt.md)
|
||||
[More details](cohorts/2025/workshops/dlt.md)
|
||||
|
||||
|
||||
### [Module 3: Data Warehouse](03-data-warehouse/)
|
||||
@ -132,11 +131,6 @@ can take the course at your own pace
|
||||
[More details](06-streaming/)
|
||||
|
||||
|
||||
### [Workshop 2: Stream Processing with SQL](cohorts/2024/workshops/rising-wave.md)
|
||||
|
||||
|
||||
[More details](cohorts/2024/workshops/rising-wave.md)
|
||||
|
||||
|
||||
### [Project](projects)
|
||||
|
||||
@ -149,7 +143,7 @@ Putting everything we learned to practice
|
||||
|
||||
## Overview
|
||||
|
||||
<img src="images/architecture/arch_v3_workshops.jpg" />
|
||||
<img src="images/architecture/arch_v4_workshops.jpg" />
|
||||
|
||||
### Prerequisites
|
||||
|
||||
@ -163,17 +157,21 @@ Prior experience with data engineering is not required.
|
||||
|
||||
## Instructors
|
||||
|
||||
- [Ankush Khanna](https://linkedin.com/in/ankushkhanna2)
|
||||
- [Victoria Perez Mola](https://www.linkedin.com/in/victoriaperezmola/)
|
||||
- [Alexey Grigorev](https://linkedin.com/in/agrigorev)
|
||||
- [Matt Palmer](https://www.linkedin.com/in/matt-palmer/)
|
||||
- [Luis Oliveira](https://www.linkedin.com/in/lgsoliveira/)
|
||||
- [Michael Shoemaker](https://www.linkedin.com/in/michaelshoemaker1/)
|
||||
- [Zach Wilson](https://www.linkedin.com/in/eczachly)
|
||||
- [Will Russell](https://www.linkedin.com/in/wrussell1999/)
|
||||
- [Anna Geller](https://www.linkedin.com/in/anna-geller-12a86811a/)
|
||||
|
||||
|
||||
|
||||
Past instructors:
|
||||
|
||||
- [Ankush Khanna](https://linkedin.com/in/ankushkhanna2)
|
||||
- [Sejal Vaidya](https://www.linkedin.com/in/vaidyasejal/)
|
||||
- [Irem Erturk](https://www.linkedin.com/in/iremerturk/)
|
||||
- [Luis Oliveira](https://www.linkedin.com/in/lgsoliveira/)
|
||||
|
||||
|
||||
## Asking for help in Slack
|
||||
@ -192,8 +190,8 @@ To make discussions in Slack more organized:
|
||||
Thanks to the course sponsors for making it possible to run this course
|
||||
|
||||
<p align="center">
|
||||
<a href="https://mage.ai/">
|
||||
<img height="120" src="images/mage.svg">
|
||||
<a href="https://kestra.io/">
|
||||
<img height="120" src="images/kestra.svg">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
@ -204,14 +202,5 @@ Thanks to the course sponsors for making it possible to run this course
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://risingwave.com/">
|
||||
<img height="90" src="images/rising-wave.png">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
Do you want to support our course and our community? Please reach out to [alexey@datatalks.club](alexey@datatalks.club)
|
||||
|
||||
## Star History
|
||||
|
||||
[](https://star-history.com/#DataTalksClub/data-engineering-zoomcamp&Date)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
## Thank you!
|
||||
|
||||
Thanks for signining up for the course.
|
||||
Thanks for signing up for the course.
|
||||
|
||||
The process of adding you to the mailing list is not automated yet,
|
||||
but you will hear from us closer to the course start.
|
||||
|
||||
@ -15,7 +15,7 @@ To keep our discussion in Slack more organized, we ask you to follow these sugge
|
||||
|
||||
### How to troubleshoot issues
|
||||
|
||||
The first step is to try to solve the issue on you own; get use to solving problems. This will be a real life skill you need when employeed.
|
||||
The first step is to try to solve the issue on you own; get used to solving problems. This will be a real life skill you need when employeed.
|
||||
|
||||
1. What does the error say? There will often be a description of the error or instructions on what is needed, I have even seen a link to the solution. Does it reference a specific line of your code?
|
||||
2. Restart the application or server/pc.
|
||||
@ -33,12 +33,12 @@ The first step is to try to solve the issue on you own; get use to solving probl
|
||||
* Before asking a question, check the [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit).
|
||||
* DO NOT use screenshots, especially don’t take pictures from a phone.
|
||||
* DO NOT tag instructors, it may discourage others from helping you.
|
||||
* Copy and past errors; if it’s long, just post it in a reply to your thread.
|
||||
* Copy and paste errors; if it’s long, just post it in a reply to your thread.
|
||||
* Use ``` for formatting your code.
|
||||
* Use the same thread for the conversation (that means reply to your own thread).
|
||||
* DO NOT create multiple posts to discus the issue.
|
||||
* Use the same thread for the conversation (that means replying to your own thread).
|
||||
* DO NOT create multiple posts to discuss the issue.
|
||||
* You may create a new post if the issue reemerges down the road. Be sure to describe what has changed in the environment.
|
||||
* Provide addition information in the same thread of the steps you have taken for resolution.
|
||||
* Provide additional information in the same thread of the steps you have taken for resolution.
|
||||
|
||||
|
||||
|
||||
|
||||
181
awesome-data-engineering.md
Normal file
@ -0,0 +1,181 @@
|
||||
Have you found any cool resources about data engineering? Put them here
|
||||
|
||||
## Learning Data Engineering
|
||||
|
||||
### Courses
|
||||
|
||||
* [Data Engineering Zoomcamp](https://github.com/DataTalksClub/data-engineering-zoomcamp) by DataTalks.Club (free)
|
||||
* [Big Data Platforms, Autumn 2022: Introduction to Big Data Processing Frameworks](https://big-data-platforms-22.mooc.fi/) by the University of Helsinki (free)
|
||||
* [Awesome Data Engineering Learning Path](https://awesomedataengineering.com/)
|
||||
|
||||
|
||||
### Books
|
||||
|
||||
* [Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems by Martin Kleppmann](https://www.amazon.com/Designing-Data-Intensive-Applications-Reliable-Maintainable/dp/1449373321)
|
||||
* [Big Data: Principles and Best Practices of Scalable Realtime Data Systems by Nathan Marz, James Warren](https://www.amazon.com/Big-Data-Principles-practices-scalable/dp/1617290343)
|
||||
* [Practical DataOps: Delivering Agile Data Science at Scale by Harvinder Atwal](https://www.amazon.com/Practical-DataOps-Delivering-Agile-Science/dp/1484251032)
|
||||
* [Data Pipelines Pocket Reference: Moving and Processing Data for Analytics by James Densmore](https://www.amazon.com/Data-Pipelines-Pocket-Reference-Processing/dp/1492087831)
|
||||
* [Best books for data engineering](https://awesomedataengineering.com/data_engineering_best_books)
|
||||
* [Fundamentals of Data Engineering: Plan and Build Robust Data Systems by Joe Reis, Matt Housley](https://www.amazon.com/Fundamentals-Data-Engineering-Robust-Systems/dp/1098108302)
|
||||
|
||||
|
||||
### Introduction to Data Engineering Terms
|
||||
|
||||
* [https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html](https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html)
|
||||
|
||||
|
||||
### Data engineering in practice
|
||||
|
||||
Conference talks from companies, blog posts, etc
|
||||
|
||||
* [Uber Data Archives](https://eng.uber.com/category/articles/uberdata/) (Uber engineering blog)
|
||||
* [Data Engineering Weekly (DE-focused substack)](https://www.dataengineeringweekly.com/)
|
||||
* [Seattle Data Guy (DE-focused substack)](https://seattledataguy.substack.com/)
|
||||
|
||||
|
||||
## Doing Data Engineering
|
||||
|
||||
### Coding & Python
|
||||
|
||||
* [CS50's Introduction to Computer Science | edX](https://www.edx.org/course/introduction-computer-science-harvardx-cs50x) (course)
|
||||
* [Python for Everybody Specialization](https://www.coursera.org/specializations/python) (course)
|
||||
* [Practical Python programming](https://github.com/dabeaz-course/practical-python/blob/master/Notes/Contents.md)
|
||||
|
||||
|
||||
### SQL
|
||||
|
||||
* [Intro to SQL: Querying and managing data | Khan Academy](https://www.khanacademy.org/computing/computer-programming/sql)
|
||||
* [Mode SQL Tutorial](https://mode.com/sql-tutorial/)
|
||||
* [Use The Index, Luke](https://use-the-index-luke.com/) (SQL Indexing and Tuning e-Book)
|
||||
* [SQL Performance Explained](https://sql-performance-explained.com/) (book)
|
||||
|
||||
|
||||
### Workflow orchestration
|
||||
|
||||
* [What is DAG?](https://youtu.be/1Yh5S-S6wsI) (video)
|
||||
* [Airflow, Prefect, and Dagster: An Inside Look](https://towardsdatascience.com/airflow-prefect-and-dagster-an-inside-look-6074781c9b77) (blog post)
|
||||
* [Open-Source Spotlight - Prefect - Kevin Kho](https://www.youtube.com/watch?v=ISLV9JyqF1w) (video)
|
||||
* [Prefect as a Data Engineering Project Workflow Tool, with Mary Clair Thompson (Duke) - 11/6/2020](https://youtu.be/HuwA4wLQtCM) (video)
|
||||
|
||||
|
||||
### ETL and ELT
|
||||
|
||||
* [ETL vs. ELT: What’s the Difference?](https://rivery.io/blog/etl-vs-elt/) (blog post) (print version)
|
||||
|
||||
### Data lakes
|
||||
|
||||
* [An Introduction to Modern Data Lake Storage Layers (Hudi, Iceberg, Delta Lake)](https://dacort.dev/posts/modern-data-lake-storage-layers/) (blog post)
|
||||
* [Lake House Architecture @ Halodoc: Data Platform 2.0](https://blogs.halodoc.io/lake-house-architecture-halodoc-data-platform-2-0/amp/) (blog post)
|
||||
|
||||
|
||||
### Data warehousing
|
||||
|
||||
|
||||
* [Guide to Data Warehousing. Short and comprehensive information… | by Tomas Peluritis](https://towardsdatascience.com/guide-to-data-warehousing-6fdcf30b6fbe) (blog post)
|
||||
* [Snowflake, Redshift, BigQuery, and Others: Cloud Data Warehouse Tools Compared](https://www.altexsoft.com/blog/snowflake-redshift-bigquery-data-warehouse-tools/) (blog post)
|
||||
|
||||
|
||||
### Streaming
|
||||
|
||||
|
||||
* Building Streaming Analytics: The Journey and Learnings - Maxim Lukichev
|
||||
|
||||
### DataOps
|
||||
|
||||
* [DataOps 101 with Lars Albertsson – DataTalks.Club](https://datatalks.club/podcast/s02e11-dataops.html) (podcast)
|
||||
|
||||
|
||||
|
||||
### Monitoring and observability
|
||||
|
||||
* [Data Observability: The Next Frontier of Data Engineering with Barr Moses](https://datatalks.club/podcast/s03e03-data-observability.html) (podcast)
|
||||
|
||||
|
||||
### Analytics engineering
|
||||
|
||||
* [Analytics Engineer: New Role in a Data Team with Victoria Perez Mola](https://datatalks.club/podcast/s03e11-analytics-engineer.html) (podcast)
|
||||
* [Modern Data Stack for Analytics Engineering - Kyle Shannon](https://www.youtube.com/watch?v=UmIZIkeOfi0) (video)
|
||||
* [Analytics Engineering vs Data Engineering | RudderStack Blog](https://www.rudderstack.com/blog/analytics-engineering-vs-data-engineering) (blog post)
|
||||
* [Learn the Fundamentals of Analytics Engineering with dbt](https://courses.getdbt.com/courses/fundamentals) (course)
|
||||
|
||||
|
||||
### Data mesh
|
||||
|
||||
* [Data Mesh in Practice - Max Schultze](https://www.youtube.com/watch?v=ekEc8D_D3zY) (video)
|
||||
|
||||
### Cloud
|
||||
|
||||
* [https://acceldataio.medium.com/data-engineering-best-practices-how-netflix-keeps-its-data-infrastructure-cost-effective-dee310bcc910](https://acceldataio.medium.com/data-engineering-best-practices-how-netflix-keeps-its-data-infrastructure-cost-effective-dee310bcc910)
|
||||
|
||||
|
||||
### Reverse ETL
|
||||
|
||||
* TODO: What is reverse ETL?
|
||||
* [https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html](https://datatalks.club/podcast/s05e02-data-engineering-acronyms.html)
|
||||
* [Open-Source Spotlight - Grouparoo - Brian Leonard](https://www.youtube.com/watch?v=hswlcgQZYuw) (video)
|
||||
* [Open-Source Spotlight - Castled.io (Reverse ETL) - Arun Thulasidharan](https://www.youtube.com/watch?v=iW0XhltAUJ8) (video)
|
||||
|
||||
## Career in Data Engineering
|
||||
|
||||
* [From Data Science to Data Engineering with Ellen König – DataTalks.Club](https://datatalks.club/podcast/s07e08-from-data-science-to-data-engineering.html) (podcast)
|
||||
* [Big Data Engineer vs Data Scientist with Roksolana Diachuk – DataTalks.Club](https://datatalks.club/podcast/s04e03-big-data-engineer-vs-data-scientist.html) (podcast)
|
||||
* [What Skills Do You Need to Become a Data Engineer](https://www.linkedin.com/pulse/what-skills-do-you-need-become-data-engineer-peng-wang/) (blog post)
|
||||
* [The future history of Data Engineering](https://groupby1.substack.com/p/data-engineering?s=r) (blog post)
|
||||
* [What Skills Do Data Engineers Need](https://www.theseattledataguy.com/what-skills-do-data-engineers-need/) (blog post)
|
||||
|
||||
### Data Engineering Management
|
||||
|
||||
* [Becoming a Data Engineering Manager with Rahul Jain – DataTalks.Club](https://datatalks.club/podcast/s07e07-becoming-a-data-engineering-manager.html) (podcast)
|
||||
|
||||
## Data engineering projects
|
||||
|
||||
* [How To Start A Data Engineering Project - With Data Engineering Project Ideas](https://www.youtube.com/watch?v=WpN47Jddo7I) (video)
|
||||
* [Data Engineering Project for Beginners - Batch edition](https://www.startdataengineering.com/post/data-engineering-project-for-beginners-batch-edition/) (blog post)
|
||||
* [Building a Data Engineering Project in 20 Minutes](https://www.sspaeti.com/blog/data-engineering-project-in-twenty-minutes/) (blog post)
|
||||
* [Automating Nike Run Club Data Analysis with Python, Airflow and Google Data Studio | by Rich Martin | Medium](https://medium.com/@rich_23525/automating-nike-run-club-data-analysis-with-python-airflow-and-google-data-studio-3c9556478926) (blog post)
|
||||
|
||||
|
||||
## Data Engineering Resources
|
||||
|
||||
### Blogs
|
||||
|
||||
* [Start Data Engineering](https://www.startdataengineering.com/)
|
||||
|
||||
### Podcasts
|
||||
|
||||
* [The Data Engineering Podcast](https://www.dataengineeringpodcast.com/)
|
||||
* [DataTalks.Club Podcast](https://datatalks.club/podcast.html) (only some episodes are about data engineering)
|
||||
|
||||
|
||||
### Communities
|
||||
|
||||
* [DataTalks.Club](https://datatalks.club/)
|
||||
* [/r/dataengineering](https://www.reddit.com/r/dataengineering)
|
||||
|
||||
|
||||
### Meetups
|
||||
|
||||
* [Sydney Data Engineers](https://sydneydataengineers.github.io/)
|
||||
|
||||
### People to follow on Twitter and LinkedIn
|
||||
|
||||
* TODO
|
||||
|
||||
### YouTube channels
|
||||
|
||||
* [Karolina Sowinska - YouTube](https://www.youtube.com/channel/UCAxnMry1lETl47xQWABvH7g)
|
||||
* [Seattle Data Guy - YouTube](https://www.youtube.com/c/SeattleDataGuy)
|
||||
* [Andreas Kretz - YouTube](https://www.youtube.com/c/andreaskayy)
|
||||
* [DataTalksClub - YouTube](https://youtube.com/c/datatalksclub) (only some videos are about data engineering)
|
||||
|
||||
### Resource aggregators
|
||||
|
||||
* [Reading List](https://www.scling.com/reading-list/) by Lars Albertsson
|
||||
* [GitHub - igorbarinov/awesome-data-engineering](https://github.com/igorbarinov/awesome-data-engineering) (focus is more on tools)
|
||||
|
||||
|
||||
## License
|
||||
|
||||
This work is licensed under a Creative Commons Attribution 4.0 International License.
|
||||
|
||||
CC BY 4.0
|
||||
@ -24,14 +24,14 @@ def compute_certificate_id(email):
|
||||
Then use this hash to get the URL
|
||||
|
||||
```python
|
||||
cohort = 2023
|
||||
cohort = 2024
|
||||
course = 'dezoomcamp'
|
||||
your_id = compute_certificate_id('never.give.up@gmail.com')
|
||||
url = f"https://certificate.datatalks.club/{course}/{cohort}/{your_id}.pdf"
|
||||
print(url)
|
||||
```
|
||||
|
||||
Example: https://certificate.datatalks.club/dezoomcamp/2023/fe629854d45c559e9c10b3b8458ea392fdeb68a9.pdf
|
||||
Example: https://certificate.datatalks.club/dezoomcamp/2024/fe629854d45c559e9c10b3b8458ea392fdeb68a9.pdf
|
||||
|
||||
|
||||
## Adding to LinkedIn
|
||||
191
cohorts/2024/02-workflow-orchestration/README.md
Normal file
@ -0,0 +1,191 @@
|
||||
> [!NOTE]
|
||||
>If you're looking for Airflow videos from the 2022 edition, check the [2022 cohort folder](../cohorts/2022/week_2_data_ingestion/).
|
||||
>
|
||||
>If you're looking for Prefect videos from the 2023 edition, check the [2023 cohort folder](../cohorts/2023/week_2_data_ingestion/).
|
||||
|
||||
# Week 2: Workflow Orchestration
|
||||
|
||||
Welcome to Week 2 of the Data Engineering Zoomcamp! 🚀😤 This week, we'll be covering workflow orchestration with Mage.
|
||||
|
||||
Mage is an open-source, hybrid framework for transforming and integrating data. ✨
|
||||
|
||||
This week, you'll learn how to use the Mage platform to author and share _magical_ data pipelines. This will all be covered in the course, but if you'd like to learn a bit more about Mage, check out our docs [here](https://docs.mage.ai/introduction/overview).
|
||||
|
||||
* [2.2.1 - 📯 Intro to Orchestration](#221----intro-to-orchestration)
|
||||
* [2.2.2 - 🧙♂️ Intro to Mage](#222---%EF%B8%8F-intro-to-mage)
|
||||
* [2.2.3 - 🐘 ETL: API to Postgres](#223----etl-api-to-postgres)
|
||||
* [2.2.4 - 🤓 ETL: API to GCS](#224----etl-api-to-gcs)
|
||||
* [2.2.5 - 🔍 ETL: GCS to BigQuery](#225----etl-gcs-to-bigquery)
|
||||
* [2.2.6 - 👨💻 Parameterized Execution](#226----parameterized-execution)
|
||||
* [2.2.7 - 🤖 Deployment (Optional)](#227----deployment-optional)
|
||||
* [2.2.8 - 🗒️ Homework](#228---️-homework)
|
||||
* [2.2.9 - 👣 Next Steps](#229----next-steps)
|
||||
|
||||
## 📕 Course Resources
|
||||
|
||||
### 2.2.1 - 📯 Intro to Orchestration
|
||||
|
||||
In this section, we'll cover the basics of workflow orchestration. We'll discuss what it is, why it's important, and how it can be used to build data pipelines.
|
||||
|
||||
Videos
|
||||
- 2.2.1a - What is Orchestration?
|
||||
|
||||
[](https://youtu.be/Li8-MWHhTbo&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=17)
|
||||
|
||||
Resources
|
||||
- [Slides](https://docs.google.com/presentation/d/17zSxG5Z-tidmgY-9l7Al1cPmz4Slh4VPK6o2sryFYvw/)
|
||||
|
||||
### 2.2.2 - 🧙♂️ Intro to Mage
|
||||
|
||||
In this section, we'll introduce the Mage platform. We'll cover what makes Mage different from other orchestrators, the fundamental concepts behind Mage, and how to get started. To cap it off, we'll spin Mage up via Docker 🐳 and run a simple pipeline.
|
||||
|
||||
Videos
|
||||
- 2.2.2a - What is Mage?
|
||||
|
||||
[](https://youtu.be/AicKRcK3pa4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=18)
|
||||
|
||||
- 2.2.2b - Configuring Mage
|
||||
|
||||
[](https://youtu.be/tNiV7Wp08XE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=19)
|
||||
|
||||
- 2.2.2c - A Simple Pipeline
|
||||
|
||||
[](https://youtu.be/stI-gg4QBnI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=20)
|
||||
|
||||
Resources
|
||||
- [Getting Started Repo](https://github.com/mage-ai/mage-zoomcamp)
|
||||
- [Slides](https://docs.google.com/presentation/d/1y_5p3sxr6Xh1RqE6N8o2280gUzAdiic2hPhYUUD6l88/)
|
||||
|
||||
### 2.2.3 - 🐘 ETL: API to Postgres
|
||||
|
||||
Hooray! Mage is up and running. Now, let's build a _real_ pipeline. In this section, we'll build a simple ETL pipeline that loads data from an API into a Postgres database. Our database will be built using Docker— it will be running locally, but it's the same as if it were running in the cloud.
|
||||
|
||||
Videos
|
||||
- 2.2.3a - Configuring Postgres
|
||||
|
||||
[](https://youtu.be/pmhI-ezd3BE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=21)
|
||||
|
||||
- 2.2.3b - Writing an ETL Pipeline : API to postgres
|
||||
|
||||
[](https://youtu.be/Maidfe7oKLs&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=22)
|
||||
|
||||
|
||||
### 2.2.4 - 🤓 ETL: API to GCS
|
||||
|
||||
Ok, so we've written data _locally_ to a database, but what about the cloud? In this tutorial, we'll walk through the process of using Mage to extract, transform, and load data from an API to Google Cloud Storage (GCS).
|
||||
|
||||
We'll cover both writing _partitioned_ and _unpartitioned_ data to GCS and discuss _why_ you might want to do one over the other. Many data teams start with extracting data from a source and writing it to a data lake _before_ loading it to a structured data source, like a database.
|
||||
|
||||
Videos
|
||||
- 2.2.4a - Configuring GCP
|
||||
|
||||
[](https://youtu.be/00LP360iYvE&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=23)
|
||||
|
||||
- 2.2.4b - Writing an ETL Pipeline : API to GCS
|
||||
|
||||
[](https://youtu.be/w0XmcASRUnc&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=24)
|
||||
|
||||
Resources
|
||||
- [DTC Zoomcamp GCP Setup](../01-docker-terraform/1_terraform_gcp/2_gcp_overview.md)
|
||||
|
||||
### 2.2.5 - 🔍 ETL: GCS to BigQuery
|
||||
|
||||
Now that we've written data to GCS, let's load it into BigQuery. In this section, we'll walk through the process of using Mage to load our data from GCS to BigQuery. This closely mirrors a very common data engineering workflow: loading data from a data lake into a data warehouse.
|
||||
|
||||
Videos
|
||||
- 2.2.5a - Writing an ETL Pipeline : GCS to BigQuery
|
||||
|
||||
[](https://youtu.be/JKp_uzM-XsM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=25)
|
||||
|
||||
### 2.2.6 - 👨💻 Parameterized Execution
|
||||
|
||||
By now you're familiar with building pipelines, but what about adding parameters? In this video, we'll discuss some built-in runtime variables that exist in Mage and show you how to define your own! We'll also cover how to use these variables to parameterize your pipelines. Finally, we'll talk about what it means to *backfill* a pipeline and how to do it in Mage.
|
||||
|
||||
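As a quick, hedged illustration (not taken from the course repo): a minimal sketch of how a Mage data loader block might read runtime variables. The block name and the `taxi_type` variable are hypothetical; `execution_date` is the built-in runtime variable covered in the video.

```python
import pandas as pd

# Standard Mage scaffolding: the decorator is injected when the block runs inside Mage
if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader


@data_loader
def load_taxi_data(*args, **kwargs):
    # Built-in runtime variable: the date/time of this pipeline run (also varied by backfills)
    execution_date = kwargs.get('execution_date')

    # Hypothetical user-defined runtime variable with a default value
    taxi_type = kwargs.get('taxi_type', 'green')

    url = (
        'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/'
        f'{taxi_type}/{taxi_type}_tripdata_'
        f'{execution_date.year}-{execution_date.month:02d}.csv.gz'
    )
    return pd.read_csv(url, compression='gzip')
```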
Videos
|
||||
- 2.2.6a - Parameterized Execution
|
||||
|
||||
[](https://youtu.be/H0hWjWxB-rg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=26)
|
||||
|
||||
|
||||
- 2.2.6b - Backfills
|
||||
|
||||
[](https://youtu.be/ZoeC6Ag5gQc&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=27)
|
||||
|
||||
Resources
|
||||
- [Mage Variables Overview](https://docs.mage.ai/development/variables/overview)
|
||||
- [Mage Runtime Variables](https://docs.mage.ai/getting-started/runtime-variable)
|
||||
|
||||
### 2.2.7 - 🤖 Deployment (Optional)
|
||||
|
||||
In this section, we'll cover deploying Mage using Terraform and Google Cloud. This section is optional— it's not *necessary* to learn Mage, but it might be helpful if you're interested in creating a fully deployed project. If you're using Mage in your final project, you'll need to deploy it to the cloud.
|
||||
|
||||
Videos
|
||||
- 2.2.7a - Deployment Prerequisites
|
||||
|
||||
[](https://youtu.be/zAwAX5sxqsg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=28)
|
||||
|
||||
- 2.2.7b - Google Cloud Permissions
|
||||
|
||||
[](https://youtu.be/O_H7DCmq2rA&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=29)
|
||||
|
||||
- 2.2.7c - Deploying to Google Cloud - Part 1
|
||||
|
||||
[](https://youtu.be/9A872B5hb_0&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=30)
|
||||
|
||||
- 2.2.7d - Deploying to Google Cloud - Part 2
|
||||
|
||||
[](https://youtu.be/0YExsb2HgLI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=31)
|
||||
|
||||
Resources
|
||||
- [Installing Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli)
|
||||
- [Installing `gcloud` CLI](https://cloud.google.com/sdk/docs/install)
|
||||
- [Mage Terraform Templates](https://github.com/mage-ai/mage-ai-terraform-templates)
|
||||
|
||||
Additional Mage Guides
|
||||
- [Terraform](https://docs.mage.ai/production/deploying-to-cloud/using-terraform)
|
||||
- [Deploying to GCP with Terraform](https://docs.mage.ai/production/deploying-to-cloud/gcp/setup)
|
||||
|
||||
### 2.2.8 - 🗒️ Homework
|
||||
|
||||
We've prepared a short exercise to test you on what you've learned this week. You can find the homework [here](../cohorts/2024/02-workflow-orchestration/homework.md). This follows closely from the contents of the course and shouldn't take more than an hour or two to complete. 😄
|
||||
|
||||
### 2.2.9 - 👣 Next Steps
|
||||
|
||||
Congratulations! You've completed Week 2 of the Data Engineering Zoomcamp. We hope you've enjoyed learning about Mage and that you're excited to use it in your final project. If you have any questions, feel free to reach out to us on Slack. Be sure to check out our "Next Steps" video for some inspiration for the rest of your journey 😄.
|
||||
|
||||
Videos
|
||||
- 2.2.9 - Next Steps
|
||||
|
||||
[](https://youtu.be/uUtj7N0TleQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=32)
|
||||
|
||||
Resources
|
||||
- [Slides](https://docs.google.com/presentation/d/1yN-e22VNwezmPfKrZkgXQVrX5owDb285I2HxHWgmAEQ/edit#slide=id.g262fb0d2905_0_12)
|
||||
|
||||
### 📑 Additional Resources
|
||||
|
||||
- [Mage Docs](https://docs.mage.ai/)
|
||||
- [Mage Guides](https://docs.mage.ai/guides)
|
||||
- [Mage Slack](https://www.mage.ai/chat)
|
||||
|
||||
|
||||
# Community notes
|
||||
|
||||
Did you take notes? You can share them here:
|
||||
|
||||
## 2024 notes
|
||||
|
||||
* [2024 Videos transcripts week 2](https://drive.google.com/drive/folders/1yxT0uMMYKa6YOxanh91wGqmQUMS7yYW7?usp=sharing) by Maria Fisher
|
||||
* [Notes from Jonah Oliver](https://www.jonahboliver.com/blog/de-zc-w2)
|
||||
* [Notes from Linda](https://github.com/inner-outer-space/de-zoomcamp-2024/blob/main/2-workflow-orchestration/readme.md)
|
||||
* [Notes from Kirill](https://github.com/kirill505/data-engineering-zoomcamp/blob/main/02-workflow-orchestration/README.md)
|
||||
* [Notes from Zharko](https://www.zharconsulting.com/contents/data/data-engineering-bootcamp-2024/week-2-ingesting-data-with-mage/)
|
||||
* Add your notes above this line
|
||||
|
||||
## 2023 notes
|
||||
|
||||
See [here](../cohorts/2023/week_2_workflow_orchestration#community-notes)
|
||||
|
||||
|
||||
## 2022 notes
|
||||
|
||||
See [here](../cohorts/2022/week_2_data_ingestion#community-notes)
|
||||
@ -1,4 +1,6 @@
|
||||
## Week 5 Homework
|
||||
## Module 5 Homework
|
||||
|
||||
Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ
|
||||
|
||||
In this homework we'll put what we learned about Spark in practice.
|
||||
|
||||
|
||||
34
cohorts/2024/06-streaming/docker-compose.yml
Normal file
@ -0,0 +1,34 @@
|
||||
version: '3.7'
|
||||
services:
|
||||
# Redpanda cluster
|
||||
redpanda-1:
|
||||
image: docker.redpanda.com/vectorized/redpanda:v22.3.5
|
||||
container_name: redpanda-1
|
||||
command:
|
||||
- redpanda
|
||||
- start
|
||||
- --smp
|
||||
- '1'
|
||||
- --reserve-memory
|
||||
- 0M
|
||||
- --overprovisioned
|
||||
- --node-id
|
||||
- '1'
|
||||
- --kafka-addr
|
||||
- PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092
|
||||
- --advertise-kafka-addr
|
||||
- PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092
|
||||
- --pandaproxy-addr
|
||||
- PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082
|
||||
- --advertise-pandaproxy-addr
|
||||
- PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082
|
||||
- --rpc-addr
|
||||
- 0.0.0.0:33145
|
||||
- --advertise-rpc-addr
|
||||
- redpanda-1:33145
|
||||
ports:
|
||||
# - 8081:8081
|
||||
- 8082:8082
|
||||
- 9092:9092
|
||||
- 28082:28082
|
||||
- 29092:29092
|
||||
@ -2,111 +2,313 @@
|
||||
|
||||
In this homework, we're going to extend Module 5 Homework and learn about streaming with PySpark.
|
||||
|
||||
Instead of Kafka, we will use Red Panda, which is a drop-in
|
||||
replacement for Kafka.
|
||||
|
||||
Ensure you have the following set up (if you had done the previous homework and the module):
|
||||
|
||||
- Docker
|
||||
- PySpark
|
||||
- Docker (see [module 1](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/01-docker-terraform))
|
||||
- PySpark (see [module 5](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/05-batch/setup))
|
||||
|
||||
For this homework we will be using the files from Module 5 Homework,
|
||||
For this homework we will be using the files from Module 5 homework:
|
||||
|
||||
- FHV 2019-10 data found here: [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz), and
|
||||
- Green 2019-10 data found here: [Green Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz)
|
||||
- Green 2019-10 data from [here](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz)
|
||||
|
||||
|
||||
|
||||
## Pre-setup
|
||||
## Start Red Panda
|
||||
|
||||
1. Extract and place the csv files in the paths under `resources` subfolder
|
||||
Let's start redpanda in a docker container.
|
||||
|
||||
There's a `docker-compose.yml` file in the homework folder (taken from [here](https://github.com/redpanda-data-blog/2023-python-gsg/blob/main/docker-compose.yml))
|
||||
|
||||
## Spin up the containers
|
||||
|
||||
|
||||
|
||||
Set rpk alias:
|
||||
```bash
|
||||
alias rpk="docker exec -it redpanda-1 rpk"
|
||||
```
|
||||
|
||||
### Question 1
|
||||
|
||||
Run following code to start. What is the `rpk` console version?
|
||||
Copy this file to your homework directory and run
|
||||
|
||||
```bash
|
||||
rpk --version
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
## Running Producer
|
||||
(Add `-d` if you want to run in detached mode)
|
||||
|
||||
|
||||
## Question 1: Redpanda version
|
||||
|
||||
Now let's find out the version of Redpanda.
|
||||
|
||||
For that, check the output of the command `rpk help` _inside the container_. The name of the container is `redpanda-1`.
|
||||
|
||||
Find out what you need to execute based on the `help` output.
|
||||
|
||||
What's the version, based on the output of the command you executed? (copy the entire version)
|
||||
|
||||
|
||||
## Question 2. Creating a topic
|
||||
|
||||
Before we can send data to the redpanda server, we
|
||||
need to create a topic. We do it also with the `rpk`
|
||||
command we used previously for figuring out the version of
|
||||
Redpanda.
|
||||
|
||||
Read the output of `help` and based on it, create a topic with name `test-topic`
|
||||
|
||||
What's the output of the command for creating a topic? Include the entire output in your answer.
|
||||
|
||||
|
||||
## Question 3. Connecting to the Kafka server
|
||||
|
||||
We need to make sure we can connect to the server, so
|
||||
later we can send some data to its topics
|
||||
|
||||
First, let's install the kafka connector (up to you if you
|
||||
want to have a separate virtual environment for that)
|
||||
|
||||
```bash
|
||||
# Run Producers for the two datasets
|
||||
python producer.py --type fhv
|
||||
python producer.py --type green
|
||||
pip install kafka-python
|
||||
```
|
||||
|
||||
### Running Streaming Script
|
||||
You can start a jupyter notebook in your solution folder or
|
||||
create a script
|
||||
|
||||
spark-submit script ensures installation of necessary jars before running the streaming.py
|
||||
Let's try to connect to our server:
|
||||
|
||||
```python
|
||||
import json
|
||||
import time
|
||||
|
||||
from kafka import KafkaProducer
|
||||
|
||||
def json_serializer(data):
|
||||
return json.dumps(data).encode('utf-8')
|
||||
|
||||
server = 'localhost:9092'
|
||||
|
||||
producer = KafkaProducer(
|
||||
bootstrap_servers=[server],
|
||||
value_serializer=json_serializer
|
||||
)
|
||||
|
||||
producer.bootstrap_connected()
|
||||
```
|
||||
|
||||
Provided that you can connect to the server, what's the output
|
||||
of the last command?
|
||||
|
||||
|
||||
## Question 4. Sending data to the stream
|
||||
|
||||
Now we're ready to send some test data:
|
||||
|
||||
```python
|
||||
t0 = time.time()
|
||||
|
||||
topic_name = 'test-topic'
|
||||
|
||||
for i in range(10):
|
||||
message = {'number': i}
|
||||
producer.send(topic_name, value=message)
|
||||
print(f"Sent: {message}")
|
||||
time.sleep(0.05)
|
||||
|
||||
producer.flush()
|
||||
|
||||
t1 = time.time()
|
||||
print(f'took {(t1 - t0):.2f} seconds')
|
||||
```
|
||||
|
||||
How much time did it take? Where did it spend most of the time?
|
||||
|
||||
* Sending the messages
|
||||
* Flushing
|
||||
* Both took approximately the same amount of time
|
||||
|
||||
(Don't remove `time.sleep` when answering this question)
|
||||
|
||||
|
||||
## Reading data with `rpk`
|
||||
|
||||
You can see the messages that you send to the topic
|
||||
with `rpk`:
|
||||
|
||||
```bash
|
||||
./spark-submit.sh streaming.py
|
||||
rpk topic consume test-topic
|
||||
```
|
||||
|
||||
### Question 2
|
||||
Run the command above and send the messages one more time to
|
||||
see them
|
||||
|
||||
**What is the most popular pickup location for FHV type taxi rides?**
|
||||
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
## Sending the taxi data
|
||||
|
||||
## Running Consumer
|
||||
Now let's send our actual data:
|
||||
|
||||
```bash
|
||||
# Run consumer with default settings
|
||||
python3 consumer.py
|
||||
# Run consumer for specific topic
|
||||
python3 consumer.py --topic [topic-name]
|
||||
* Read the green csv.gz file
|
||||
* We will only need these columns:
|
||||
* `'lpep_pickup_datetime',`
|
||||
* `'lpep_dropoff_datetime',`
|
||||
* `'PULocationID',`
|
||||
* `'DOLocationID',`
|
||||
* `'passenger_count',`
|
||||
* `'trip_distance',`
|
||||
* `'tip_amount'`
|
||||
|
||||
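Here's a minimal sketch of this step, assuming the file was downloaded as `green_tripdata_2019-10.csv.gz` into your working directory:

```python
import pandas as pd

# Only the columns listed above are needed
columns = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
]

# pandas infers the gzip compression from the .gz extension
df_green = pd.read_csv('green_tripdata_2019-10.csv.gz', usecols=columns)
```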
Iterate over the records in the dataframe
|
||||
|
||||
```python
|
||||
for row in df_green.itertuples(index=False):
|
||||
row_dict = {col: getattr(row, col) for col in row._fields}
|
||||
print(row_dict)
|
||||
break
|
||||
|
||||
# TODO implement sending the data here
|
||||
```
|
||||
|
||||
### Question 4:
|
||||
most popular PUlocationID for fhv trip taxis
|
||||
Note: this way of iterating over the records is more efficient compared
|
||||
to `iterrows`
|
||||
|
||||
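Here's a minimal sketch of how the TODO above could be completed. It assumes the `producer` from Question 3 is still connected and that the topic is called `green-trips` (you create it in Question 5 below):

```python
import time

topic_name = 'green-trips'  # the topic created in Question 5

t0 = time.time()

# Send every record as a JSON message (the producer's value_serializer handles encoding)
for row in df_green.itertuples(index=False):
    row_dict = {col: getattr(row, col) for col in row._fields}
    producer.send(topic_name, value=row_dict)

producer.flush()

print(f'took {(time.time() - t0):.2f} seconds')
```

Timing this loop (with no `time.sleep` calls) gives you the answer to Question 5.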
|
||||
## Question 5: Sending the Trip Data
|
||||
|
||||
### Question 5:
|
||||
least popular DOlocationID for fhv trip taxis
|
||||
* Create a topic `green-trips` and send the data there
|
||||
* How much time in seconds did it take? (You can round it to a whole number)
|
||||
* Make sure you don't include sleeps in your code
|
||||
|
||||
|
||||
## Creating the PySpark consumer
|
||||
|
||||
## Question
|
||||
Now let's read the data with PySpark.
|
||||
|
||||
```bash
|
||||
rpk cluster info
|
||||
rpk topic list --detailed
|
||||
Spark needs a library (jar) to be able to connect to Kafka,
|
||||
so we need to tell PySpark that it needs to use it:
|
||||
|
||||
```python
|
||||
import pyspark
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
pyspark_version = pyspark.__version__
|
||||
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"
|
||||
|
||||
spark = SparkSession \
|
||||
.builder \
|
||||
.master("local[*]") \
|
||||
.appName("GreenTripsConsumer") \
|
||||
.config("spark.jars.packages", kafka_jar_package) \
|
||||
.getOrCreate()
|
||||
```
|
||||
|
||||
Create topic `rides_all` using the `rpk` CLI command in the terminal.
|
||||
Now we can connect to the stream:
|
||||
|
||||
Which of these is the correct command to create topic with 1 partitions and 1 replica?
|
||||
```python
|
||||
green_stream = spark \
|
||||
.readStream \
|
||||
.format("kafka") \
|
||||
.option("kafka.bootstrap.servers", "localhost:9092") \
|
||||
.option("subscribe", "green-trips") \
|
||||
.option("startingOffsets", "earliest") \
|
||||
.load()
|
||||
```
|
||||
|
||||
- `rpk topics creates rides_all --partitions 12 --replicas 1`
|
||||
- `rpk topic rides_all --partitions 1 --replicas 1`
|
||||
- `rpk topic create list rides_all --partitions 1 --replicas 1`
|
||||
- `rpk topic create rides_all --partitions 1 --replicas 1`
|
||||
In order to test that we can consume from the stream,
|
||||
let's see what will be the first record there.
|
||||
|
||||
Run the correct command in the terminal to create the topic.
|
||||
In Spark streaming, the stream is represented as a sequence of
|
||||
small batches, each batch being a small RDD (or a small dataframe).
|
||||
|
||||
So we can execute a function over each mini-batch.
|
||||
Let's run `take(1)` there to see what do we have in the stream:
|
||||
|
||||
```python
|
||||
def peek(mini_batch, batch_id):
|
||||
first_row = mini_batch.take(1)
|
||||
|
||||
if first_row:
|
||||
print(first_row[0])
|
||||
|
||||
query = green_stream.writeStream.foreachBatch(peek).start()
|
||||
```
|
||||
|
||||
You should see a record like this:
|
||||
|
||||
```
|
||||
Row(key=None, value=bytearray(b'{"lpep_pickup_datetime": "2019-10-01 00:26:02", "lpep_dropoff_datetime": "2019-10-01 00:39:58", "PULocationID": 112, "DOLocationID": 196, "passenger_count": 1.0, "trip_distance": 5.88, "tip_amount": 0.0}'), topic='green-trips', partition=0, offset=0, timestamp=datetime.datetime(2024, 3, 12, 22, 42, 9, 411000), timestampType=0)
|
||||
```
|
||||
|
||||
Now let's stop the query, so it doesn't keep consuming messages
|
||||
from the stream
|
||||
|
||||
```python
|
||||
query.stop()
|
||||
```
|
||||
|
||||
## Question 6. Parsing the data
|
||||
|
||||
The data is JSON, but currently it's in binary format. We need
|
||||
to parse it and turn it into a streaming dataframe with proper
|
||||
columns.
|
||||
|
||||
Similarly to PySpark, we define the schema
|
||||
|
||||
```python
|
||||
from pyspark.sql import types
|
||||
|
||||
schema = types.StructType() \
|
||||
.add("lpep_pickup_datetime", types.StringType()) \
|
||||
.add("lpep_dropoff_datetime", types.StringType()) \
|
||||
.add("PULocationID", types.IntegerType()) \
|
||||
.add("DOLocationID", types.IntegerType()) \
|
||||
.add("passenger_count", types.DoubleType()) \
|
||||
.add("trip_distance", types.DoubleType()) \
|
||||
.add("tip_amount", types.DoubleType())
|
||||
```
|
||||
|
||||
And apply this schema:
|
||||
|
||||
```python
|
||||
from pyspark.sql import functions as F
|
||||
|
||||
green_stream = green_stream \
|
||||
.select(F.from_json(F.col("value").cast('STRING'), schema).alias("data")) \
|
||||
.select("data.*")
|
||||
```
|
||||
|
||||
How does the record look after parsing? Copy the output.
|
||||
|
||||
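One way to inspect the parsed record is to reuse the `peek` helper defined earlier (a sketch; it assumes the cells above have already been run):

```python
# Peek at the first parsed record, then stop the query so it doesn't keep consuming
query = green_stream.writeStream.foreachBatch(peek).start()
# ...wait for a row to appear in the console output...
query.stop()
```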
|
||||
### Question :
|
||||
most common locationID where a taxi can drop off and pickup a passenger at the same location within a 10min threshold (windowing lesson).
|
||||
### Question 7: Most popular destination
|
||||
|
||||
Now let's finally do some streaming analytics. We will
|
||||
see what's the most popular destination currently
|
||||
based on our stream of data (which ideally we should
|
||||
have sent with delays like we did in workshop 2)
|
||||
|
||||
|
||||
This is how you can do it:
|
||||
|
||||
* Add a column "timestamp" using the `current_timestamp` function
|
||||
* Group by:
|
||||
* 5 minutes window based on the timestamp column (`F.window(col("timestamp"), "5 minutes")`)
|
||||
* `"DOLocationID"`
|
||||
* Order by count
|
||||
|
||||
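Here's a minimal sketch of these steps, assuming `green_stream` is the parsed stream from Question 6 (the `popular_destinations` name matches the snippet below):

```python
from pyspark.sql import functions as F

popular_destinations = green_stream \
    .withColumn("timestamp", F.current_timestamp()) \
    .groupBy(
        F.window(F.col("timestamp"), "5 minutes"),
        F.col("DOLocationID")
    ) \
    .count() \
    .orderBy(F.col("count").desc())
```

With `outputMode("complete")`, Spark re-emits the full aggregated table on every trigger, which is why the console sink below shows the running counts.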
You can print the output to the console using this
|
||||
code
|
||||
|
||||
```python
|
||||
query = popular_destinations \
|
||||
.writeStream \
|
||||
.outputMode("complete") \
|
||||
.format("console") \
|
||||
.option("truncate", "false") \
|
||||
.start()
|
||||
|
||||
query.awaitTermination()
|
||||
```
|
||||
|
||||
Write the most popular destination; your answer should be *either* the zone ID or the zone name of this destination. (You will need to re-send the data for this to work.)
|
||||
|
||||
<!-- scrap the above questions? -->
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
* Form for submitting: TBA
|
||||
* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw6
|
||||
|
||||
|
||||
## Solution
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
* [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ)
|
||||
* [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing)
|
||||
* Course Playlist: Only 2024 Live videos & homeworks (TODO)
|
||||
* [Public Leaderboard of Top-100 Participants](leaderboard.md)
|
||||
|
||||
|
||||
[**Module 1: Introduction & Prerequisites**](01-docker-terraform/)
|
||||
|
||||
700
cohorts/2024/leaderboard.md
Normal file
@ -0,0 +1,700 @@
|
||||
## Leaderboard
|
||||
|
||||
This is the top [100 leaderboard](https://courses.datatalks.club/de-zoomcamp-2024/leaderboard)
|
||||
of participants of Data Engineering Zoomcamp 2024 edition!
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Projects</th>
|
||||
<th>Social</th>
|
||||
<th>Comments</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ashraf Mohammad</td>
|
||||
<td><a href="https://github.com/Ashraf1395/customer_retention_analytics"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/Ashraf1395/supply_chain_finance.git"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="www.linkedin.com/in/ashraf1395"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="www.github.com/Ashraf1395"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Really Recommend this bootcamp , if you want to get hands on data engineering experience. My two Capstone project: www.github.com/Ashraf1395/supply_chain_finance, www.github.com/Ashraf1395/customer_retention_analytics
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jorge Vladimir Abrego Arevalo</td>
|
||||
<td><a href="https://github.com/JorgeAbrego/weather_stream_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/JorgeAbrego/capital_bikeshare_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/jorge-abrego/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/JorgeAbrego"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Purnendu Shekhar Shukla</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Krishna Anand</td>
|
||||
<td><a href="https://github.com/anandaiml19/DE_Zoomcamp_Project2/tree/main"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/anandaiml19/Data-Engineering-Zoomcamp-Project1"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/krishna-anand-v-g-70bba623/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/anandaiml19"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Abhijit Chakraborty</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Hekmatullah Sajid</td>
|
||||
<td><a href="https://github.com/hekmatullah-sajid/EcoEnergy-Germany"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/hekmatullah-sajid/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/hekmatullah-sajid"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Lottie Jane Pollard</td>
|
||||
<td><a href="https://github.com/LottieJaneDev/usgs_earthquake_data_pipeline"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/lottiejanedev/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/LottieJaneDev/usgs_earthquake_data_pipeline"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>AviAnna</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ketut Garjita</td>
|
||||
<td><a href="https://github.com/garjita63/dezoomcamp2024-project1"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/ketutgarjitadba/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/garjita63"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
I would like to express my thanks and appreciation to the Data Talks Club for organizing this excellent Data Engineering Zoomcamp training. This made me valuable experience in deepening new knowledge for me even though previously I had mostly worked as a Database Administrator for various platform databases. Thank you also to the community (datatalks-club.slack.com), especially slack course-data-engineering, as well as other slack communities such as mageai.slack.com.
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Diogo Costa</td>
|
||||
<td><a href="https://github.com/techwithcosta/youtube-ai-analytics"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/costadms/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/techwithcosta"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Great course! Check out my YouTube channel: https://www.youtube.com/@TechWithCosta
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Francisco Ortiz Tena</td>
|
||||
<td><a href="https://github.com/FranciscoOrtizTena/de_zoomcamp_project_01/"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/francisco-ortiz-tena/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/FranciscoOrtizTena"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
It is an awesome course!
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Nevenka Lukic</td>
|
||||
<td><a href="https://github.com/nenalukic/air-quality-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/nevenka-lukic/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/nenalukic"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
This DE Zoomcamp was fantastic learning and networking experiences. Many thanks to organizers and big recommendations to anyone!
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Mukhammad Sofyan Rizka Akbar</td>
|
||||
<td><a href="https://github.com/SofyanAkbar94/Project-DE-Zoomcamp-2024"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://id.linkedin.com/in/m-sofyan-r-a-aa00a4118"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/SofyanAkbar94/"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Thanks for providing this course, especially for Alexey and other Datatalk hosts and I hope I can join ML, ML Ops, and LLM Zoomcamp. See you soon :)
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Mahmoud Mahdy Zaky</td>
|
||||
<td><a href="https://github.com/MahmoudMahdy448/Football-Data-Analytics/tree/main"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/mahmoud-mahdy-zaky"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/MahmoudMahdy448"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Brilliant Pancake</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jobert M. Gutierrez</td>
|
||||
<td><a href="https://github.com/bizzaccelerator/Footballers-transfers-Insights.git"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="www.linkedin.com/in/jobertgutierrez"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/bizzaccelerator"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Olusegun Samson Ayeni</td>
|
||||
<td><a href="https://github.com/iamraphson/IMDB-pipeline-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/iamraphson/DE-2024-project-book-recommendation"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/iamraphson/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/iamraphson"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Lily Chau</td>
|
||||
<td><a href="https://github.com/lilychau1/uk-power-analytics"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/lilychau1/uk-power-analytics/tree/main"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="www.linkedin.com/in/lilychau1"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/lilychau1"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Big thank you to Alexey and all other speakers. This is one of the best online learning platforms I have ever come across.
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Aleksandr Kolmakov</td>
|
||||
<td><a href="https://github.com/Feanaur/marine-species-analytics"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/Feanaur/marine-species-analytics"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/aleksandr-kolmakov/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/alex-kolmakov"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Kang Zhi Yong</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Eduardo Muñoz Sala</td>
|
||||
<td><a href="https://github.com/edumunozsala/GDELT-Events-Data-Eng-Project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/edumunozsala/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/edumunozsala"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Kirill Bazarov</td>
|
||||
<td><a href="https://github.com/kirill505/de-zoomcamp-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/kirill-bazarov-66ba3152"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/kirill505"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Shayan Shafiee Moghadam</td>
|
||||
<td><a href="https://github.com/shayansm2/DE-zoomcamp-playground/tree/de-zoomcamp-2nd-project/github-events-analyzer"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a><a href="https://github.com/shayansm2/tech-career-explorer/tree/de-zoomcamp-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/shayan-shafiee-moghadam/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/shayansm2"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Landry N.</td>
|
||||
<td><a href="https://github.com/drux31/capstone-dezoomcamp"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://github.com/drux31"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Thanks for the awsome course.
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Condescending Austin</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Lee Durbin</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Loving Einstein</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Carlos Vecina Tebar</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Abiodun Oki</td>
|
||||
<td></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/okibaba/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/Okibaba"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
thoroughly enjoyed the course, great work Alexey & course team!
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jimoh</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Sleepy Villani</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ella Cinders</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Max Lutz</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jessica De Silva</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Daniel Okello</td>
|
||||
<td></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/okellodaniel/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/okellodaniel"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Kirill Sitnikov</td>
|
||||
<td><a href="https://github.com/Siddha911/Citibike-data-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="Siddha911"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Thank you Alexey and all DTC team! I’m so glad that I knew about your courses and projects!
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>edumad</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Duy Quoc Vo</td>
|
||||
<td><a href="https://github.com/voduyquoc/air_pollution_tracking"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/voduyquoc/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/voduyquoc"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
NA
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Xiang Li</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Sugeng Wahyudi</td>
|
||||
<td><a href="https://github.com/Gengsu07/DEGengsuProject"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/sugeng-wahyudi-8a3939132/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/Gengsu07"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Thanks a lot, this was amazing. Can't miss another course and zoomcamp from datatalks.club
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Anatolii Kryvko</td>
|
||||
<td><a href="https://github.com/Nogromi/ukraine-vaccinations/tree/master"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/anatolii-kryvko-69b538107/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/Nogromi"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>David Vanegas</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Honey Badger</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Abdelrahman Kamal</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jean Paul Rodriguez</td>
|
||||
<td><a href="https://github.com/jeanpaulrd1/de-zc-final-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/jean-paul-rodriguez"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/jeanpaulrd1/de-zc-final-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Eager Pasteur</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Damian Pszczoła</td>
|
||||
<td><a href="https://github.com/d4mp3/GLDAS-Data-Pipeline"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/damian-pszczo%C5%82a-7aba54241/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/d4mp3"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>ManPrat</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>forrest_parnassus</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ramazan Abylkassov</td>
|
||||
<td><a href="https://github.com/ramazanabylkassov/aviation_stack_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/ramazan-abylkassov-23965097/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/ramazanabylkassov"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Look mom, I am on leaderboard!
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Digamber Deshmukh</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Andrew Lee</td>
|
||||
<td><a href="https://github.com/wndrlxx/ca-trademarks-data-pipeline"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Matt R</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Raul Antonio Catacora Grundy</td>
|
||||
<td><a href="https://github.com/Cerpint4xt/data-engineering-all-news-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/raul-catacora-grundy-208315236/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/Cerpint4xt"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
I just want to thank everyone, all the instructors, collaborators for creating this amazing set of resources and such a solid community based on sharing and caring. Many many thanks and shout out to you guys
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ranga H.</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Salma Gouda</td>
|
||||
<td><a href="https://github.com/salmagouda/data-engineering-capstone/tree/main"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://linkedin.com/in/salmagouda"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/salmagouda"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Artsiom Turevich</td>
|
||||
<td><a href="https://github.com/aturevich/zoomcamp_de_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/artsiom-turevich/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="a.turevich"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
A long time ago in a galaxy far, far away...
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Abhirup Ghosh</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Sonny Pham</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Peter Tran</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ritika Tilwalia</td>
|
||||
<td><a href="https://github.com/rtilwalia/Fashion-Campus-Orders"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/ritika-tilwalia/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/rtilwalia"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Eager Yalow</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Dave Samaniego</td>
|
||||
<td><a href="https://github.com/nishiikata/de-zoomcamp-2024-mage-capstone"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/dave-s-32545014a"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/nishiikata"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Thank you DataTalksClub for the course. It was challenging learning many new things, but I had fun along the way too!
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Lucid Keldysh</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Isaac Ndirangu Muturi</td>
|
||||
<td><a href="https://github.com/Isaac-Ndirangu-Muturi-749/End_to_end_data_pipeline--Optimizing_Online_Retail_Analytics_with_Data_and_Analytics_Engineering"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/isaac-muturi-3b6b2b237"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/Isaac-Ndirangu-Muturi-749"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Amazing learning experience
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Agitated Wing</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Hanaa HAMMAD</td>
|
||||
<td></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/hanaahammad/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/hanaahammad"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Grateful to this great course
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jonah Oliver</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Paul Emilio Arizpe Colorado</td>
|
||||
<td><a href="https://github.com/kiramishima/crimes_in_mexico_city_analysis"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/parizpe/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/kiramishima"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
DataTalksClub brought me the opportunity to learn data engineering. Thanks for all :D
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Asma-Chloë FARAH</td>
|
||||
<td><a href="https://github.com/AsmaChloe/traffic_counting_paris"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/asma-chloefarah/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/AsmaChloe"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Thank you for this amazing zoomcamp ! It was really fun !
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Happy Feistel</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Luca Pugliese</td>
|
||||
<td><a href="https://github.com/lucapug/nyc-bike-analytics"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/lucapug/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/lucapug"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
it has been a crowdlearning experience! starting in thousands of us. 359 graduated in the end. Proud to have classified 59th. Thanks to all.
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jake Maund</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Aditya Phulallwar</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Dave Wilson</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Haitham Hussein Hamad</td>
|
||||
<td><a href="https://github.com/haithamhamad2/kaggle-survey"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/haitham-hamad-8926b415/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/haithamhamad2"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Alexandre Bergere aka Rocket</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>TOGBAN COKOUVI Joyce Elvis Mahoutondji</td>
|
||||
<td><a href="https://github.com/lvsuno/Github_data_analysis"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/elvistogban/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/lvsuno/Github_data_analysis"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Sad Robinson</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Tetiana Omelchenko</td>
|
||||
<td><a href="https://github.com/TOmelchenko/LifeExpectancyProject"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="www.linkedin.com/in/tetiana-omelchenko-35177379"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/TOmelchenko/LifeExpectancyProject"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Amanda Kershaw</td>
|
||||
<td><a href="https://github.com/ANKershaw/youtube_video_ranks"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/amandalnkershaw"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/ANKershaw/youtube_video_ranks"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
This course was incredibly rewarding and absolutely worth the effort.
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Kristjan Sert</td>
|
||||
<td><a href="https://github.com/KrisSert/cadaster-ee"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/kristjan-sert-043396131/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/KrisSert"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Murad Arfanyan</td>
|
||||
<td><a href="https://github.com/murkenson/movies_tv_shows_data_pipeline"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/murad-arfanyan-846786176/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/murkenson"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ecstatic Hofstadter</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Chung Huu Tin</td>
|
||||
<td><a href="https://github.com/TinChung41/US-Accidents-Analysis-zoomcamp-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="linkedin.com/in/huu-tin-chung"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/TinChung41"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Zen Mayer</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Zhastay Yeltay</td>
|
||||
<td><a href="https://github.com/yelzha/tengrinews-open-project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/yelzha/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/yelzha"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
;)
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>AV3NII</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Sebastian Alejandro Peralta Casafranca</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Relaxed Williams</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>George Mouratos</td>
|
||||
<td><a href="https://github.com/Gimour/Datatalks_final_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/gmouratos/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/Gimour/DataTalks, https://github.com/Gimour/Datatalks_final_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
-
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>mhmed ahmed rjb</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Frosty Jackson</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>WANJOHI</td>
|
||||
<td><a href="https://github.com/DE-ZoomCamp/Flood-Monitoring"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://github.com/DE-ZoomCamp/Flood-Monitoring"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ighorr Holstrom</td>
|
||||
<td><a href="https://github.com/askeladden31/air_raids_data/"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/ighorr-holstrom/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/askeladden31"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jesse Delzio</td>
|
||||
<td></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/delzioj"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/delzio"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Khalil El Daou</td>
|
||||
<td><a href="https://github.com/khalileldoau/global-news-engagement-on-social-media"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/khalil-el-daou-177a8b114?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=android_app"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/khalileldoau"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td><details>
|
||||
<summary>comment</summary>
|
||||
Already made a post about the zoomcamp
|
||||
</details></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Juan Rojas</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Gonçalo</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Muhamad Farikhin</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Bold Lederberg</td>
|
||||
<td></a></td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Taras Shalaiko</td>
|
||||
<td><a href="https://github.com/tarasenya/dezoomcamp_final_project"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></a></td>
|
||||
<td> <a href="https://www.linkedin.com/in/taras-shalaiko-30114a107/"><img src="https://user-images.githubusercontent.com/875246/192300614-2ce22ed5-bbc4-4684-8098-d8128d71aac5.png" height="16em" /></a> <a href="https://github.com/tarasenya"><img src="https://user-images.githubusercontent.com/875246/192300611-a606521b-cb76-4090-be8e-7cc21752b996.png" height="16em" /></a></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
</table>
|
||||
@ -25,51 +25,19 @@ so in total, you will make three submissions.
|
||||
|
||||
#### Project Attempt #1
|
||||
|
||||
Project:
|
||||
* Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project1
|
||||
* Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project1/eval
|
||||
|
||||
* Form: TBA
|
||||
* Deadline: TBA
|
||||
|
||||
Peer reviewing:
|
||||
|
||||
* Peer review assignments: TBA ("project-01" sheet)
|
||||
* Form: TBA
|
||||
* Deadline: TBA
|
||||
|
||||
Project feedback: TBA ("project-01" sheet)
|
||||
|
||||
|
||||
#### Project Attempt #2


Project:

* Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project2
* Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project2/eval

* Form: TBA
* Deadline: TBA


Peer reviewing:

* Peer review assignments: TBA ("project-02" sheet)
* Form: TBA
* Deadline: TBA


Project feedback: TBA ("project-02" sheet)
|
||||
|
||||
> **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2024/enrollment -
|
||||
this is what we will use when generating certificates for you.
|
||||
|
||||
### Evaluation criteria
|
||||
|
||||
See [here](../../projects/README.md)
|
||||
See [here](../../week_7_project/README.md)
|
||||
|
||||
|
||||
### Misc
|
||||
|
||||
To get the hash for your project, use this function to hash your email:
|
||||
|
||||
```python
|
||||
from hashlib import sha1
|
||||
|
||||
def compute_hash(email):
|
||||
return sha1(email.lower().encode('utf-8')).hexdigest()
|
||||
```
|
||||
|
||||
Or use [this website](http://www.sha1-online.com/).
|
||||
|
||||
@ -79,8 +79,7 @@ Workshop video:
|
||||
|
||||
**Please set up the environment in [Getting Started](https://github.com/risingwavelabs/risingwave-data-talks-workshop-2024-03-04?tab=readme-ov-file#getting-started) and for the [Homework](https://github.com/risingwavelabs/risingwave-data-talks-workshop-2024-03-04/blob/main/homework.md#setting-up) first.**
|
||||
|
||||
|
||||
### Question 0
|
||||
|
||||
_This question is just a warm-up to introduce the dynamic filter pattern; please attempt it before viewing its solution._
|
||||
|
||||
@ -114,7 +113,14 @@ CREATE MATERIALIZED VIEW latest_dropoff_time AS
|
||||
|
||||
### Question 1
|
||||
|
||||
Create a materialized view to compute the average, min and max trip time **between each taxi zone**.
|
||||
|
||||
Note that we do not consider `a->b` and `b->a` as the same trip pair.
|
||||
So as an example, you would consider the following trip pairs as different pairs:
|
||||
```plaintext
|
||||
Yorkville East -> Steinway
|
||||
Steinway -> Yorkville East
|
||||
```
|
||||
|
||||
From this MV, find the pair of taxi zones with the highest average trip time.
|
||||
You may need to use the [dynamic filter pattern](https://docs.risingwave.com/docs/current/sql-pattern-dynamic-filters/) for this.
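
As a rough sketch only (not the official solution), assuming the workshop's `trip_data` and `taxi_zone` tables with `pulocationid`/`dolocationid` columns and pickup/dropoff timestamps, the MV plus a dynamic-filter lookup could look like this:

```sql
-- Sketch only: table and column names are assumed from the workshop setup.
CREATE MATERIALIZED VIEW trip_time_stats AS
SELECT
    pz.zone AS pickup_zone,
    dz.zone AS dropoff_zone,
    avg(t.tpep_dropoff_datetime - t.tpep_pickup_datetime) AS avg_trip_time,
    min(t.tpep_dropoff_datetime - t.tpep_pickup_datetime) AS min_trip_time,
    max(t.tpep_dropoff_datetime - t.tpep_pickup_datetime) AS max_trip_time
FROM trip_data AS t
JOIN taxi_zone AS pz ON t.pulocationid = pz.location_id
JOIN taxi_zone AS dz ON t.dolocationid = dz.location_id
GROUP BY pz.zone, dz.zone;

-- Dynamic filter: keep only the pair whose average equals the global maximum.
CREATE MATERIALIZED VIEW longest_avg_trip_pair AS
SELECT pickup_zone, dropoff_zone, avg_trip_time
FROM trip_time_stats
WHERE avg_trip_time = (SELECT max(avg_trip_time) FROM trip_time_stats);
```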
|
||||
@ -128,9 +134,11 @@ Options:
|
||||
3. East Flatbush/Farragut, East Harlem North
|
||||
4. Midtown Center, University Heights/Morris Heights
|
||||
|
||||
P.S. The trip time between taxi zones does not take symmetry into account, i.e. `A -> B` and `B -> A` are considered different trips. This applies to subsequent questions as well.
|
||||
|
||||
### Question 2
|
||||
|
||||
Recreate the MV(s) in question 1, to also find the **number of trips** for the pair of taxi zones with the highest average trip time.
|
||||
|
||||
Options:
|
||||
1. 5
|
||||
@ -141,8 +149,8 @@ Options:
|
||||
### Question 3
|
||||
|
||||
From the latest pickup time to 17 hours before, what are the top 3 busiest zones in terms of number of pickups?
|
||||
For example, if the latest pickup time is 2020-01-01 17:00:00,
then the query should return the top 3 busiest zones from 2020-01-01 00:00:00 to 2020-01-01 17:00:00.
|
||||
|
||||
HINT: You can use [dynamic filter pattern](https://docs.risingwave.com/docs/current/sql-pattern-dynamic-filters/)
|
||||
to create a filter condition based on the latest pickup time.
|
||||
|
||||
192 cohorts/2025/01-docker-terraform/homework.md Normal file
@ -0,0 +1,192 @@
|
||||
# Module 1 Homework: Docker & SQL
|
||||
|
||||
In this homework we'll prepare the environment and practice
|
||||
Docker and SQL
|
||||
|
||||
When submitting your homework, you will also need to include
|
||||
a link to your GitHub repository or other public code-hosting
|
||||
site.
|
||||
|
||||
This repository should contain the code for solving the homework.
|
||||
|
||||
When your solution includes SQL or shell commands rather than code files (e.g. Python scripts), include them directly in the README file of your repository.
|
||||
|
||||
|
||||
## Question 1. Understanding docker first run
|
||||
|
||||
Run docker with the `python:3.12.8` image in interactive mode, using the entrypoint `bash`.
|
||||
|
||||
What's the version of `pip` in the image?
|
||||
|
||||
- 24.3.1
|
||||
- 24.2.1
|
||||
- 23.3.1
|
||||
- 23.2.1
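
One way to check this, sketched below (it assumes Docker is installed locally):

```bash
# Start the image interactively with bash as the entrypoint,
# then check pip inside the container.
docker run -it --rm --entrypoint=bash python:3.12.8
pip --version
```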
|
||||
|
||||
|
||||
## Question 2. Understanding Docker networking and docker-compose
|
||||
|
||||
Given the following `docker-compose.yaml`, what is the `hostname` and `port` that **pgadmin** should use to connect to the postgres database?
|
||||
|
||||
```yaml
|
||||
services:
|
||||
db:
|
||||
container_name: postgres
|
||||
image: postgres:17-alpine
|
||||
environment:
|
||||
POSTGRES_USER: 'postgres'
|
||||
POSTGRES_PASSWORD: 'postgres'
|
||||
POSTGRES_DB: 'ny_taxi'
|
||||
ports:
|
||||
- '5433:5432'
|
||||
volumes:
|
||||
- vol-pgdata:/var/lib/postgresql/data
|
||||
|
||||
pgadmin:
|
||||
container_name: pgadmin
|
||||
image: dpage/pgadmin4:latest
|
||||
environment:
|
||||
PGADMIN_DEFAULT_EMAIL: "pgadmin@pgadmin.com"
|
||||
PGADMIN_DEFAULT_PASSWORD: "pgadmin"
|
||||
ports:
|
||||
- "8080:80"
|
||||
volumes:
|
||||
- vol-pgadmin_data:/var/lib/pgadmin
|
||||
|
||||
volumes:
|
||||
vol-pgdata:
|
||||
name: vol-pgdata
|
||||
vol-pgadmin_data:
|
||||
name: vol-pgadmin_data
|
||||
```
|
||||
|
||||
- postgres:5433
|
||||
- localhost:5432
|
||||
- db:5433
|
||||
- postgres:5432
|
||||
- db:5432
|
||||
|
||||
|
||||
## Prepare Postgres
|
||||
|
||||
Run Postgres and load data as shown in the videos
|
||||
We'll use the green taxi trips from October 2019:
|
||||
|
||||
```bash
|
||||
wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz
|
||||
```
|
||||
|
||||
You will also need the dataset with zones:
|
||||
|
||||
```bash
|
||||
wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
|
||||
```
|
||||
|
||||
Download this data and put it into Postgres.
|
||||
|
||||
You can use the code from the course. It's up to you whether
|
||||
you want to use Jupyter or a python script.
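
For example, a minimal ingestion sketch with pandas and SQLAlchemy; the connection string assumes the docker-compose setup from Question 2 (Postgres published on host port 5433), and the table names `green_taxi_trips` and `zones` are just placeholders:

```python
import pandas as pd
from sqlalchemy import create_engine

# Postgres published on host port 5433 in the compose file above.
engine = create_engine("postgresql://postgres:postgres@localhost:5433/ny_taxi")

# pandas reads the gzipped CSV directly.
trips = pd.read_csv(
    "green_tripdata_2019-10.csv.gz",
    parse_dates=["lpep_pickup_datetime", "lpep_dropoff_datetime"],
)
zones = pd.read_csv("taxi_zone_lookup.csv")

# Write in chunks so the month of data loads without large memory spikes.
trips.to_sql("green_taxi_trips", engine, if_exists="replace", index=False, chunksize=100_000)
zones.to_sql("zones", engine, if_exists="replace", index=False)
```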
|
||||
|
||||
## Question 3. Trip Segmentation Count
|
||||
|
||||
During the period of October 1st 2019 (inclusive) and November 1st 2019 (exclusive), how many trips, **respectively**, happened:
|
||||
1. Up to 1 mile
|
||||
2. In between 1 (exclusive) and 3 miles (inclusive),
|
||||
3. In between 3 (exclusive) and 7 miles (inclusive),
|
||||
4. In between 7 (exclusive) and 10 miles (inclusive),
|
||||
5. Over 10 miles
|
||||
|
||||
Answers:
|
||||
|
||||
- 104,793; 197,670; 110,612; 27,831; 35,281
|
||||
- 104,793; 198,924; 109,603; 27,678; 35,189
|
||||
- 101,056; 201,407; 110,612; 27,831; 35,281
|
||||
- 101,056; 202,661; 109,603; 27,678; 35,189
|
||||
- 104,838; 199,013; 109,645; 27,688; 35,202
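
One way to bucket the trips, shown only as a sketch: it assumes the `green_taxi_trips` table loaded above and filters on the pickup timestamp (adjust the date filter to whichever timestamps the question intends):

```sql
SELECT
    CASE
        WHEN trip_distance <= 1  THEN '1: up to 1 mile'
        WHEN trip_distance <= 3  THEN '2: 1-3 miles'
        WHEN trip_distance <= 7  THEN '3: 3-7 miles'
        WHEN trip_distance <= 10 THEN '4: 7-10 miles'
        ELSE '5: over 10 miles'
    END AS distance_bucket,
    count(*) AS num_trips
FROM green_taxi_trips
WHERE lpep_pickup_datetime >= '2019-10-01'
  AND lpep_pickup_datetime <  '2019-11-01'
GROUP BY 1
ORDER BY 1;
```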
|
||||
|
||||
|
||||
## Question 4. Longest trip for each day
|
||||
|
||||
Which was the pick up day with the longest trip distance?
|
||||
Use the pick up time for your calculations.
|
||||
|
||||
Tip: For every day, we only care about one single trip with the longest distance.
|
||||
|
||||
- 2019-10-11
|
||||
- 2019-10-24
|
||||
- 2019-10-26
|
||||
- 2019-10-31
|
||||
|
||||
|
||||
## Question 5. Three biggest pickup zones
|
||||
|
||||
Which were the top pickup locations with over 13,000 in
|
||||
`total_amount` (across all trips) for 2019-10-18?
|
||||
|
||||
Consider only `lpep_pickup_datetime` when filtering by date.
|
||||
|
||||
- East Harlem North, East Harlem South, Morningside Heights
|
||||
- East Harlem North, Morningside Heights
|
||||
- Morningside Heights, Astoria Park, East Harlem South
|
||||
- Bedford, East Harlem North, Astoria Park
|
||||
|
||||
|
||||
## Question 6. Largest tip
|
||||
|
||||
For the passengers picked up in October 2019 in the zone
named "East Harlem North", which was the drop-off zone that had
the largest tip?
|
||||
|
||||
Note: it's `tip`, not `trip`.
|
||||
|
||||
We need the name of the zone, not the ID.
|
||||
|
||||
- Yorkville West
|
||||
- JFK Airport
|
||||
- East Harlem North
|
||||
- East Harlem South
|
||||
|
||||
|
||||
## Terraform
|
||||
|
||||
In this section of the homework, we'll prepare the environment by creating resources in GCP with Terraform.
|
||||
|
||||
In your VM on GCP/Laptop/GitHub Codespace install Terraform.
|
||||
Copy the files from the course repo
|
||||
[here](../../../01-docker-terraform/1_terraform_gcp/terraform) to your VM/Laptop/GitHub Codespace.
|
||||
|
||||
Modify the files as necessary to create a GCP Bucket and Big Query Dataset.
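
The two resources could look roughly like this. This is only a sketch with hypothetical names; your project ID, region, bucket name and dataset ID will differ:

```hcl
resource "google_storage_bucket" "demo_bucket" {
  name          = "my-de-zoomcamp-bucket" # hypothetical; bucket names must be globally unique
  location      = "US"
  force_destroy = true
}

resource "google_bigquery_dataset" "demo_dataset" {
  dataset_id = "demo_dataset" # hypothetical dataset name
  location   = "US"
}
```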
|
||||
|
||||
|
||||
## Question 7. Terraform Workflow
|
||||
|
||||
Which of the following sequences, **respectively**, describes the workflow for:
|
||||
1. Downloading the provider plugins and setting up the backend,
2. Generating proposed changes and auto-executing the plan,
3. Removing all resources managed by Terraform
|
||||
|
||||
Answers:
|
||||
- terraform import, terraform apply -y, terraform destroy
|
||||
- teraform init, terraform plan -auto-apply, terraform rm
|
||||
- terraform init, terraform run -auto-aprove, terraform destroy
|
||||
- terraform init, terraform apply -auto-aprove, terraform destroy
|
||||
- terraform import, terraform apply -y, terraform rm
|
||||
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw1
|
||||
|
||||
```bash
docker run -it \
    -e POSTGRES_USER="postgres" \
    -e POSTGRES_PASSWORD="postgres" \
    -e POSTGRES_DB="ny_taxi" \
    -v dtc_postgres_volume_local:/var/lib/postgresql/data \
    -p 5432:5432 \
    --network=pg-network \
    --name pg-database \
    postgres:17
```
|
||||
98 cohorts/2025/02-workflow-orchestration/homework.md Normal file
@ -0,0 +1,98 @@
|
||||
## Module 2 Homework (DRAFT)
|
||||
|
||||
ATTENTION: At the end of the submission form, you will be required to include a link to your GitHub repository or other public code-hosting site. This repository should contain your code for solving the homework. If your solution includes code that is not in file format, please include these directly in the README file of your repository.
|
||||
|
||||
> In case you don't get one option exactly, select the closest one
|
||||
|
||||
For the homework, we'll be working with the _green_ taxi dataset located here:
|
||||
|
||||
`https://github.com/DataTalksClub/nyc-tlc-data/releases/tag/green/download`
|
||||
|
||||
To get a `wget`-able link, use this prefix (note that the link itself gives 404):
|
||||
|
||||
`https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/`
|
||||
|
||||
### Assignment
|
||||
|
||||
The goal will be to construct an ETL pipeline that loads the data, performs some transformations, and writes the data to a database (and Google Cloud!).
|
||||
|
||||
- Create a new pipeline, call it `green_taxi_etl`
|
||||
- Add a data loader block and use Pandas to read data for the final quarter of 2020 (months `10`, `11`, `12`).
|
||||
- You can use the same datatypes and date parsing methods shown in the course.
|
||||
- `BONUS`: load the final three months using a for loop and `pd.concat`
|
||||
- Add a transformer block and perform the following (a minimal sketch is shown after this assignment list):
|
||||
- Remove rows where the passenger count is equal to 0 _and_ the trip distance is equal to zero.
|
||||
- Create a new column `lpep_pickup_date` by converting `lpep_pickup_datetime` to a date.
|
||||
- Rename columns in Camel Case to Snake Case, e.g. `VendorID` to `vendor_id`.
|
||||
- Add three assertions:
|
||||
- `vendor_id` is one of the existing values in the column (currently)
|
||||
- `passenger_count` is greater than 0
|
||||
- `trip_distance` is greater than 0
|
||||
- Using a Postgres data exporter (SQL or Python), write the dataset to a table called `green_taxi` in a schema `mage`. Replace the table if it already exists.
|
||||
- Write your data as Parquet files to a bucket in GCP, partitioned by `lpep_pickup_date`. Use the `pyarrow` library!
|
||||
- Schedule your pipeline to run daily at 5AM UTC.
|
||||
|
||||
### Questions
|
||||
|
||||
## Question 1. Data Loading
|
||||
|
||||
Once the dataset is loaded, what's the shape of the data?
|
||||
|
||||
* 266,855 rows x 20 columns
|
||||
* 544,898 rows x 18 columns
|
||||
* 544,898 rows x 20 columns
|
||||
* 133,744 rows x 20 columns
|
||||
|
||||
## Question 2. Data Transformation
|
||||
|
||||
Upon filtering the dataset where the passenger count is greater than 0 _and_ the trip distance is greater than zero, how many rows are left?
|
||||
|
||||
* 544,897 rows
|
||||
* 266,855 rows
|
||||
* 139,370 rows
|
||||
* 266,856 rows
|
||||
|
||||
## Question 3. Data Transformation
|
||||
|
||||
Which of the following creates a new column `lpep_pickup_date` by converting `lpep_pickup_datetime` to a date?
|
||||
|
||||
* `data = data['lpep_pickup_datetime'].date`
|
||||
* `data('lpep_pickup_date') = data['lpep_pickup_datetime'].date`
|
||||
* `data['lpep_pickup_date'] = data['lpep_pickup_datetime'].dt.date`
|
||||
* `data['lpep_pickup_date'] = data['lpep_pickup_datetime'].dt().date()`
|
||||
|
||||
## Question 4. Data Transformation
|
||||
|
||||
What are the existing values of `VendorID` in the dataset?
|
||||
|
||||
* 1, 2, or 3
|
||||
* 1 or 2
|
||||
* 1, 2, 3, 4
|
||||
* 1
|
||||
|
||||
## Question 5. Data Transformation
|
||||
|
||||
How many columns need to be renamed to snake case?
|
||||
|
||||
* 3
|
||||
* 6
|
||||
* 2
|
||||
* 4
|
||||
|
||||
## Question 6. Data Exporting
|
||||
|
||||
Once exported, how many partitions (folders) are present in Google Cloud?
|
||||
|
||||
* 96
|
||||
* 56
|
||||
* 67
|
||||
* 108
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw2
|
||||
* Check the link above to see the due date
|
||||
|
||||
## Solution
|
||||
|
||||
Will be added after the due date
|
||||
86 cohorts/2025/03-data-warehouse/homework.md Normal file
@ -0,0 +1,86 @@
|
||||
## Module 3 Homework (DRAFT)
|
||||
|
||||
Solution: https://www.youtube.com/watch?v=8g_lRKaC9ro
|
||||
|
||||
ATTENTION: At the end of the submission form, you will be required to include a link to your GitHub repository or other public code-hosting site. This repository should contain your code for solving the homework. If your solution includes code that is not in file format (such as SQL queries or shell commands), please include these directly in the README file of your repository.
|
||||
|
||||
<b><u>Important Note:</b></u> <p> For this homework we will be using the 2022 Green Taxi Trip Record Parquet Files from the New York
|
||||
City Taxi Data found here: </br> https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page </br>
|
||||
If you are using orchestration such as Mage, Airflow or Prefect, do not load the data into Big Query using the orchestrator.</br>
Stop after loading the files into a bucket. </br></br>
|
||||
<u>NOTE:</u> You will need to use the PARQUET option files when creating an External Table</br>
|
||||
|
||||
<b>SETUP:</b></br>
|
||||
Create an external table using the Green Taxi Trip Records Data for 2022. </br>
|
||||
Create a table in BQ using the Green Taxi Trip Records for 2022 (do not partition or cluster this table). </br>
|
||||
</p>
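
A sketch of this setup in BigQuery SQL; the project, dataset and bucket names below are placeholders to replace with your own:

```sql
-- External table pointing at the 2022 Green Taxi parquet files in your bucket.
CREATE OR REPLACE EXTERNAL TABLE `your_project.ny_taxi.external_green_tripdata_2022`
OPTIONS (
  format = 'PARQUET',
  uris = ['gs://your-bucket/green_taxi_2022/*.parquet']
);

-- Regular BigQuery table, neither partitioned nor clustered.
CREATE OR REPLACE TABLE `your_project.ny_taxi.green_tripdata_2022` AS
SELECT * FROM `your_project.ny_taxi.external_green_tripdata_2022`;
```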
|
||||
|
||||
## Question 1:
|
||||
What is the count of records for the 2022 Green Taxi Data?
|
||||
- 65,623,481
|
||||
- 840,402
|
||||
- 1,936,423
|
||||
- 253,647
|
||||
|
||||
## Question 2:
|
||||
Write a query to count the distinct number of PULocationIDs for the entire dataset on both the tables.</br>
|
||||
What is the estimated amount of data that will be read when this query is executed on the External Table and the Table?
|
||||
|
||||
- 0 MB for the External Table and 6.41MB for the Materialized Table
|
||||
- 18.82 MB for the External Table and 47.60 MB for the Materialized Table
|
||||
- 0 MB for the External Table and 0MB for the Materialized Table
|
||||
- 2.14 MB for the External Table and 0MB for the Materialized Table
|
||||
|
||||
|
||||
## Question 3:
|
||||
How many records have a fare_amount of 0?
|
||||
- 12,488
|
||||
- 128,219
|
||||
- 112
|
||||
- 1,622
|
||||
|
||||
## Question 4:
|
||||
What is the best strategy to make an optimized table in Big Query if your query will always order the results by PUlocationID and filter based on lpep_pickup_datetime? (Create a new table with this strategy)
|
||||
- Cluster on lpep_pickup_datetime Partition by PUlocationID
|
||||
- Partition by lpep_pickup_datetime Cluster on PUlocationID
|
||||
- Partition by lpep_pickup_datetime and Partition by PUlocationID
|
||||
- Cluster on by lpep_pickup_datetime and Cluster on PUlocationID
|
||||
|
||||
## Question 5:
|
||||
Write a query to retrieve the distinct PULocationID between lpep_pickup_datetime
|
||||
06/01/2022 and 06/30/2022 (inclusive)</br>
|
||||
|
||||
Use the materialized table you created earlier in your from clause and note the estimated bytes. Now change the table in the from clause to the partitioned table you created for question 4 and note the estimated bytes processed. What are these values? </br>
|
||||
|
||||
Choose the answer which most closely matches.</br>
|
||||
|
||||
- 22.82 MB for non-partitioned table and 647.87 MB for the partitioned table
|
||||
- 12.82 MB for non-partitioned table and 1.12 MB for the partitioned table
|
||||
- 5.63 MB for non-partitioned table and 0 MB for the partitioned table
|
||||
- 10.31 MB for non-partitioned table and 10.31 MB for the partitioned table
|
||||
|
||||
|
||||
## Question 6:
|
||||
Where is the data stored in the External Table you created?
|
||||
|
||||
- Big Query
|
||||
- GCP Bucket
|
||||
- Big Table
|
||||
- Container Registry
|
||||
|
||||
|
||||
## Question 7:
|
||||
It is best practice in Big Query to always cluster your data:
|
||||
- True
|
||||
- False
|
||||
|
||||
|
||||
## (Bonus: Not worth points) Question 8:
|
||||
No Points: Write a `SELECT count(*)` query FROM the materialized table you created. How many bytes does it estimate will be read? Why?
|
||||
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw3
|
||||
|
||||
|
||||
81 cohorts/2025/04-analytics-engineering/homework.md Normal file
@ -0,0 +1,81 @@
|
||||
## Module 4 Homework (DRAFT)
|
||||
|
||||
In this homework, we'll use the models developed during the week 4 videos and enhance the dbt project presented there, using the FHV taxi data for year 2019 that is already loaded in our DWH.
|
||||
|
||||
This means that in this homework we use the following data [Datasets list](https://github.com/DataTalksClub/nyc-tlc-data/)
|
||||
* Yellow taxi data - Years 2019 and 2020
|
||||
* Green taxi data - Years 2019 and 2020
|
||||
* fhv data - Year 2019.
|
||||
|
||||
We will use the data loaded for:
|
||||
|
||||
* Building a source table: `stg_fhv_tripdata`
|
||||
* Building a fact table: `fact_fhv_trips`
|
||||
* Create a dashboard
|
||||
|
||||
If you don't have access to GCP, you can do this locally using the ingested data from your Postgres database
|
||||
instead. If you have access to GCP, you don't need to do it for local Postgres - only if you want to.
|
||||
|
||||
> **Note**: if your answer doesn't match exactly, select the closest option
|
||||
|
||||
### Question 1:
|
||||
|
||||
**What happens when we execute dbt build --vars '{'is_test_run':'true'}'**
|
||||
You'll need to have completed the ["Build the first dbt models"](https://www.youtube.com/watch?v=UVI30Vxzd6c) video.
|
||||
- It's the same as running *dbt build*
|
||||
- It applies a _limit 100_ to all of our models
|
||||
- It applies a _limit 100_ only to our staging models
|
||||
- Nothing
|
||||
|
||||
### Question 2:
|
||||
|
||||
**What is the code that our CI job will run? Where is this code coming from?**
|
||||
|
||||
- The code that has been merged into the main branch
|
||||
- The code that is behind the creation object on the dbt_cloud_pr_ schema
|
||||
- The code from any development branch that has been opened based on main
|
||||
- The code from the development branch we are requesting to merge to main
|
||||
|
||||
|
||||
### Question 3 (2 points)
|
||||
|
||||
**What is the count of records in the model fact_fhv_trips after running all dependencies with the test run variable disabled (:false)?**
|
||||
Create a staging model for the fhv data, similar to the ones made for yellow and green data. Add an additional filter for keeping only records with pickup time in year 2019.
|
||||
Do not add a deduplication step. Run these models without limits (is_test_run: false).
|
||||
|
||||
Create a core model similar to fact trips, but selecting from stg_fhv_tripdata and joining with dim_zones.
|
||||
Similar to what we've done in fact_trips, keep only records with known entries for pickup and dropoff locations.
|
||||
Run the dbt model without limits (is_test_run: false).
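
As a sketch, the staging model might look roughly like this; the source name, schema and column names are assumptions to adapt to your own project:

```sql
-- models/staging/stg_fhv_tripdata.sql (sketch)
{{ config(materialized='view') }}

select
    dispatching_base_num,
    cast(pickup_datetime as timestamp)  as pickup_datetime,
    cast(dropoff_datetime as timestamp) as dropoff_datetime,
    cast(pulocationid as integer)       as pickup_locationid,
    cast(dolocationid as integer)       as dropoff_locationid
from {{ source('staging', 'fhv_tripdata') }}
where extract(year from pickup_datetime) = 2019
```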
|
||||
|
||||
- 12998722
|
||||
- 22998722
|
||||
- 32998722
|
||||
- 42998722
|
||||
|
||||
### Question 4 (2 points)
|
||||
|
||||
**Which service had the most rides during July 2019, after building a tile for the fact_fhv_trips table alongside the fact_trips tile as seen in the videos?**
|
||||
|
||||
Create a dashboard with some tiles that you find interesting to explore the data. One tile should show the number of trips per month, as done in the videos for fact_trips, including the fact_fhv_trips data.
|
||||
|
||||
- FHV
|
||||
- Green
|
||||
- Yellow
|
||||
- FHV and Green
|
||||
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw4
|
||||
|
||||
Deadline: 22 February (Thursday), 22:00 CET
|
||||
|
||||
|
||||
## Solution (To be published after deadline)
|
||||
|
||||
* Video: https://youtu.be/3OPggh5Rca8
|
||||
* Answers:
|
||||
* Question 1: It applies a _limit 100_ only to our staging models
|
||||
* Question 2: The code from the development branch we are requesting to merge to main
|
||||
* Question 3: 22998722
|
||||
* Question 4: Yellow
|
||||
100
cohorts/2025/05-batch/homework.md
Normal file
@ -0,0 +1,100 @@
|
||||
## Module 5 Homework (DRAFT)
|
||||
|
||||
Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ
|
||||
|
||||
In this homework we'll put what we learned about Spark in practice.
|
||||
|
||||
For this homework we will be using the FHV 2019-10 data found here: [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz)
|
||||
|
||||
### Question 1:
|
||||
|
||||
**Install Spark and PySpark**
|
||||
|
||||
- Install Spark
|
||||
- Run PySpark
|
||||
- Create a local Spark session
|
||||
- Execute `spark.version`.
|
||||
|
||||
What's the output?
|
||||
|
||||
> [!NOTE]
|
||||
> To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md)
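For reference, a minimal sketch of these steps (assuming PySpark is already installed as described in the guide above):

```python
import pyspark
from pyspark.sql import SparkSession

# Create a local Spark session
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("homework") \
    .getOrCreate()

# The output of this is the answer to the question
print(spark.version)
```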
|
||||
|
||||
### Question 2:
|
||||
|
||||
**FHV October 2019**
|
||||
|
||||
Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons.
|
||||
|
||||
Repartition the Dataframe to 6 partitions and save it to parquet.
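A possible sketch of these steps, assuming `spark` is the session from Question 1. The column names and types are an assumption based on the FHV 2019 files, so check them against the CSV header; the output path is arbitrary.

```python
from pyspark.sql import types

# Schema for the FHV file -- verify names and types against the CSV header
fhv_schema = types.StructType([
    types.StructField("dispatching_base_num", types.StringType(), True),
    types.StructField("pickup_datetime", types.TimestampType(), True),
    types.StructField("dropOff_datetime", types.TimestampType(), True),
    types.StructField("PUlocationID", types.IntegerType(), True),
    types.StructField("DOlocationID", types.IntegerType(), True),
    types.StructField("SR_Flag", types.StringType(), True),
    types.StructField("Affiliated_base_number", types.StringType(), True),
])

df_fhv = spark.read \
    .option("header", "true") \
    .schema(fhv_schema) \
    .csv("fhv_tripdata_2019-10.csv.gz")

# Repartition to 6 partitions and write out as parquet
df_fhv.repartition(6).write.parquet("data/pq/fhv/2019/10/", mode="overwrite")
```

You can then look at the sizes of the `part-*.parquet` files in the output folder to answer the question.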
|
||||
|
||||
What is the average size of the Parquet files (ending with the `.parquet` extension) that were created, in MB? Select the answer which most closely matches.
|
||||
|
||||
- 1MB
|
||||
- 6MB
|
||||
- 25MB
|
||||
- 87MB
|
||||
|
||||
|
||||
|
||||
### Question 3:
|
||||
|
||||
**Count records**
|
||||
|
||||
How many taxi trips were there on the 15th of October?
|
||||
|
||||
Consider only trips that started on the 15th of October.
|
||||
|
||||
- 108,164
|
||||
- 12,856
|
||||
- 452,470
|
||||
- 62,610
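One way to get this count, assuming `df_fhv` was read with the schema from Question 2:

```python
from pyspark.sql import functions as F

# Keep only trips whose pickup date is 2019-10-15, then count them
df_fhv \
    .filter(F.to_date(F.col("pickup_datetime")) == "2019-10-15") \
    .count()
```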
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Be aware of the column order when defining the schema
|
||||
|
||||
### Question 4:
|
||||
|
||||
**Longest trip for each day**
|
||||
|
||||
What is the length of the longest trip in the dataset in hours?
|
||||
|
||||
- 631,152.50 Hours
|
||||
- 243.44 Hours
|
||||
- 7.68 Hours
|
||||
- 3.32 Hours
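A sketch of one way to compute this, again assuming `df_fhv` from Question 2:

```python
from pyspark.sql import functions as F

# Trip duration in hours: timestamps cast to epoch seconds, difference / 3600
df_fhv \
    .withColumn(
        "duration_hours",
        (F.col("dropOff_datetime").cast("long") - F.col("pickup_datetime").cast("long")) / 3600
    ) \
    .agg(F.max("duration_hours").alias("max_duration_hours")) \
    .show()
```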
|
||||
|
||||
|
||||
|
||||
### Question 5:
|
||||
|
||||
**User Interface**
|
||||
|
||||
Spark’s User Interface, which shows the application's dashboard, runs on which local port?
|
||||
|
||||
- 80
|
||||
- 443
|
||||
- 4040
|
||||
- 8080
|
||||
|
||||
|
||||
|
||||
### Question 6:
|
||||
|
||||
**Least frequent pickup location zone**
|
||||
|
||||
Load the zone lookup data into a temp view in Spark<br>
|
||||
[Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv)
|
||||
|
||||
Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location zone?
|
||||
|
||||
- East Chelsea
|
||||
- Jamaica Bay
|
||||
- Union Sq
|
||||
- Crown Heights North
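One possible setup, assuming `spark` and `df_fhv` from the earlier questions; the lookup file name and its column names (`LocationID`, `Zone`) are taken from the zone lookup CSV and should be verified against it.

```python
# Load the zone lookup data and register both dataframes as temp views
df_zones = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("taxi_zone_lookup.csv")

df_zones.createOrReplaceTempView("zones")
df_fhv.createOrReplaceTempView("fhv")

# Count pickups per zone and look at the least frequent ones
spark.sql("""
    SELECT z.Zone AS pickup_zone, COUNT(*) AS trips
    FROM fhv f
    JOIN zones z ON f.PUlocationID = z.LocationID
    GROUP BY z.Zone
    ORDER BY trips ASC
    LIMIT 5
""").show(truncate=False)
```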
|
||||
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
- Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw5
|
||||
- Deadline: See the website
|
||||
318
cohorts/2025/06-streaming/homework.md
Normal file
@ -0,0 +1,318 @@
|
||||
## Module 6 Homework (DRAFT)
|
||||
|
||||
In this homework, we're going to extend Module 5 Homework and learn about streaming with PySpark.
|
||||
|
||||
Instead of Kafka, we will use Redpanda, which is a drop-in
|
||||
replacement for Kafka.
|
||||
|
||||
Ensure you have the following set up (you should already have it if you completed the previous homework and the module):
|
||||
|
||||
- Docker (see [module 1](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/01-docker-terraform))
|
||||
- PySpark (see [module 5](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/05-batch/setup))
|
||||
|
||||
For this homework we will be using the files from Module 5 homework:
|
||||
|
||||
- Green 2019-10 data from [here](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz)
|
||||
|
||||
|
||||
|
||||
## Start Redpanda
|
||||
|
||||
Let's start Redpanda in a Docker container.
|
||||
|
||||
There's a `docker-compose.yml` file in the homework folder (taken from [here](https://github.com/redpanda-data-blog/2023-python-gsg/blob/main/docker-compose.yml))
|
||||
|
||||
Copy this file to your homework directory and run
|
||||
|
||||
```bash
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
(Add `-d` if you want to run in detached mode)
|
||||
|
||||
|
||||
## Question 1: Redpanda version
|
||||
|
||||
Now let's find out the version of Redpanda.
|
||||
|
||||
For that, check the output of the command `rpk help` _inside the container_. The name of the container is `redpanda-1`.
|
||||
|
||||
Find out what you need to execute based on the `help` output.
|
||||
|
||||
What's the version, based on the output of the command you executed? (copy the entire version)
|
||||
|
||||
|
||||
## Question 2. Creating a topic
|
||||
|
||||
Before we can send data to the Redpanda server, we
|
||||
need to create a topic. We also do this with the `rpk`
|
||||
command we used previously for figuring out the version of
|
||||
Redpanda.
|
||||
|
||||
Read the output of `help` and, based on it, create a topic with the name `test-topic`.
|
||||
|
||||
What's the output of the command for creating a topic? Include the entire output in your answer.
|
||||
|
||||
|
||||
## Question 3. Connecting to the Kafka server
|
||||
|
||||
We need to make sure we can connect to the server, so
|
||||
later we can send some data to its topics
|
||||
|
||||
First, let's install the Kafka Python client (it's up to you whether you
|
||||
want to have a separate virtual environment for that)
|
||||
|
||||
```bash
|
||||
pip install kafka-python
|
||||
```
|
||||
|
||||
You can start a Jupyter notebook in your solution folder or
|
||||
create a script
|
||||
|
||||
Let's try to connect to our server:
|
||||
|
||||
```python
|
||||
import json
|
||||
import time
|
||||
|
||||
from kafka import KafkaProducer
|
||||
|
||||
def json_serializer(data):
|
||||
return json.dumps(data).encode('utf-8')
|
||||
|
||||
server = 'localhost:9092'
|
||||
|
||||
producer = KafkaProducer(
|
||||
bootstrap_servers=[server],
|
||||
value_serializer=json_serializer
|
||||
)
|
||||
|
||||
producer.bootstrap_connected()
|
||||
```
|
||||
|
||||
Provided that you can connect to the server, what's the output
|
||||
of the last command?
|
||||
|
||||
|
||||
## Question 4. Sending data to the stream
|
||||
|
||||
Now we're ready to send some test data:
|
||||
|
||||
```python
|
||||
t0 = time.time()
|
||||
|
||||
topic_name = 'test-topic'
|
||||
|
||||
for i in range(10):
|
||||
message = {'number': i}
|
||||
producer.send(topic_name, value=message)
|
||||
print(f"Sent: {message}")
|
||||
time.sleep(0.05)
|
||||
|
||||
producer.flush()
|
||||
|
||||
t1 = time.time()
|
||||
print(f'took {(t1 - t0):.2f} seconds')
|
||||
```
|
||||
|
||||
How much time did it take? Where did it spend most of the time?
|
||||
|
||||
* Sending the messages
|
||||
* Flushing
|
||||
* Both took approximately the same amount of time
|
||||
|
||||
(Don't remove `time.sleep` when answering this question)
|
||||
|
||||
|
||||
## Reading data with `rpk`
|
||||
|
||||
You can see the messages that you send to the topic
|
||||
with `rpk`:
|
||||
|
||||
```bash
|
||||
rpk topic consume test-topic
|
||||
```
|
||||
|
||||
Run the command above and send the messages one more time to
|
||||
see them
|
||||
|
||||
|
||||
## Sending the taxi data
|
||||
|
||||
Now let's send our actual data:
|
||||
|
||||
* Read the green csv.gz file (see the sketch after this list)
|
||||
* We will only need these columns:
|
||||
* `'lpep_pickup_datetime',`
|
||||
* `'lpep_dropoff_datetime',`
|
||||
* `'PULocationID',`
|
||||
* `'DOLocationID',`
|
||||
* `'passenger_count',`
|
||||
* `'trip_distance',`
|
||||
* `'tip_amount'`
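A minimal pandas sketch for this step, assuming the file linked above has been downloaded to the working directory:

```python
import pandas as pd

columns = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
]

# pandas transparently decompresses .csv.gz files
df_green = pd.read_csv('green_tripdata_2019-10.csv.gz', usecols=columns)
```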
|
||||
|
||||
Iterate over the records in the dataframe
|
||||
|
||||
```python
|
||||
for row in df_green.itertuples(index=False):
|
||||
row_dict = {col: getattr(row, col) for col in row._fields}
|
||||
print(row_dict)
|
||||
break
|
||||
|
||||
# TODO implement sending the data here
|
||||
```
|
||||
|
||||
Note: this way of iterating over the records is more efficient compared
|
||||
to `iterrows`
|
||||
|
||||
|
||||
## Question 5: Sending the Trip Data
|
||||
|
||||
* Create a topic `green-trips` and send the data there (see the sketch below)
|
||||
* How much time in seconds did it take? (You can round it to a whole number)
|
||||
* Make sure you don't include sleeps in your code
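A sketch of how the timing could look, reusing the `producer` from Question 3 and `df_green` from the previous step. Converting numpy scalars to plain Python types is an extra step added here as an assumption, so that the JSON serializer doesn't choke on them.

```python
t0 = time.time()

topic_name = 'green-trips'

for row in df_green.itertuples(index=False):
    row_dict = {}
    for col in row._fields:
        value = getattr(row, col)
        # numpy scalars aren't JSON serializable, so convert them to plain Python types
        row_dict[col] = value.item() if hasattr(value, 'item') else value
    producer.send(topic_name, value=row_dict)

# Wait until all messages are actually delivered before stopping the timer
producer.flush()

t1 = time.time()
print(f'took {(t1 - t0):.2f} seconds')
```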
|
||||
|
||||
|
||||
## Creating the PySpark consumer
|
||||
|
||||
Now let's read the data with PySpark.
|
||||
|
||||
Spark needs a library (jar) to be able to connect to Kafka,
|
||||
so we need to tell PySpark that it needs to use it:
|
||||
|
||||
```python
|
||||
import pyspark
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
pyspark_version = pyspark.__version__
|
||||
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"
|
||||
|
||||
spark = SparkSession \
|
||||
.builder \
|
||||
.master("local[*]") \
|
||||
.appName("GreenTripsConsumer") \
|
||||
.config("spark.jars.packages", kafka_jar_package) \
|
||||
.getOrCreate()
|
||||
```
|
||||
|
||||
Now we can connect to the stream:
|
||||
|
||||
```python
|
||||
green_stream = spark \
|
||||
.readStream \
|
||||
.format("kafka") \
|
||||
.option("kafka.bootstrap.servers", "localhost:9092") \
|
||||
.option("subscribe", "green-trips") \
|
||||
.option("startingOffsets", "earliest") \
|
||||
.load()
|
||||
```
|
||||
|
||||
In order to test that we can consume from the stream,
|
||||
let's see what the first record there will be.
|
||||
|
||||
In Spark streaming, the stream is represented as a sequence of
|
||||
small batches, each batch being a small RDD (or a small dataframe).
|
||||
|
||||
So we can execute a function over each mini-batch.
|
||||
Let's run `take(1)` there to see what we have in the stream:
|
||||
|
||||
```python
|
||||
def peek(mini_batch, batch_id):
|
||||
first_row = mini_batch.take(1)
|
||||
|
||||
if first_row:
|
||||
print(first_row[0])
|
||||
|
||||
query = green_stream.writeStream.foreachBatch(peek).start()
|
||||
```
|
||||
|
||||
You should see a record like this:
|
||||
|
||||
```
|
||||
Row(key=None, value=bytearray(b'{"lpep_pickup_datetime": "2019-10-01 00:26:02", "lpep_dropoff_datetime": "2019-10-01 00:39:58", "PULocationID": 112, "DOLocationID": 196, "passenger_count": 1.0, "trip_distance": 5.88, "tip_amount": 0.0}'), topic='green-trips', partition=0, offset=0, timestamp=datetime.datetime(2024, 3, 12, 22, 42, 9, 411000), timestampType=0)
|
||||
```
|
||||
|
||||
Now let's stop the query, so it doesn't keep consuming messages
|
||||
from the stream
|
||||
|
||||
```python
|
||||
query.stop()
|
||||
```
|
||||
|
||||
## Question 6. Parsing the data
|
||||
|
||||
The data is JSON, but currently it's in binary format. We need
|
||||
to parse it and turn it into a streaming dataframe with proper
|
||||
columns.
|
||||
|
||||
Similarly to what we did with PySpark in the batch module, we define the schema:
|
||||
|
||||
```python
|
||||
from pyspark.sql import types
|
||||
|
||||
schema = types.StructType() \
|
||||
.add("lpep_pickup_datetime", types.StringType()) \
|
||||
.add("lpep_dropoff_datetime", types.StringType()) \
|
||||
.add("PULocationID", types.IntegerType()) \
|
||||
.add("DOLocationID", types.IntegerType()) \
|
||||
.add("passenger_count", types.DoubleType()) \
|
||||
.add("trip_distance", types.DoubleType()) \
|
||||
.add("tip_amount", types.DoubleType())
|
||||
```
|
||||
|
||||
And apply this schema:
|
||||
|
||||
```python
|
||||
from pyspark.sql import functions as F
|
||||
|
||||
green_stream = green_stream \
|
||||
.select(F.from_json(F.col("value").cast('STRING'), schema).alias("data")) \
|
||||
.select("data.*")
|
||||
```
|
||||
|
||||
How does the record look after parsing? Copy the output.
|
||||
|
||||
|
||||
## Question 7: Most popular destination
|
||||
|
||||
Now let's finally do some streaming analytics. We will
|
||||
see what the currently most popular destination is,
|
||||
based on our stream of data (which ideally we should
|
||||
have sent with delays like we did in workshop 2)
|
||||
|
||||
|
||||
This is how you can do it (a minimal sketch follows the list):
|
||||
|
||||
* Add a column "timestamp" using the `current_timestamp` function
|
||||
* Group by:
|
||||
* 5 minutes window based on the timestamp column (`F.window(F.col("timestamp"), "5 minutes")`)
|
||||
* `"DOLocationID"`
|
||||
* Order by count
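A minimal sketch of these steps, assuming `green_stream` is the parsed streaming dataframe from Question 6; the name `popular_destinations` is used so it matches the console snippet below.

```python
from pyspark.sql import functions as F

# Windowed count of dropoffs per location, most popular first
popular_destinations = green_stream \
    .withColumn("timestamp", F.current_timestamp()) \
    .groupBy(
        F.window(F.col("timestamp"), "5 minutes"),
        F.col("DOLocationID")
    ) \
    .count() \
    .orderBy(F.col("count").desc())
```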
|
||||
|
||||
You can print the output to the console using this
|
||||
code
|
||||
|
||||
```python
|
||||
query = popular_destinations \
|
||||
.writeStream \
|
||||
.outputMode("complete") \
|
||||
.format("console") \
|
||||
.option("truncate", "false") \
|
||||
.start()
|
||||
|
||||
query.awaitTermination()
|
||||
```
|
||||
|
||||
Write the most popular destination; your answer should be *either* the zone ID or the zone name of this destination. (You will need to re-send the data for this to work.)
|
||||
|
||||
|
||||
## Submitting the solutions
|
||||
|
||||
* Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw6
|
||||
|
||||
|
||||
## Solution
|
||||
|
||||
We will publish the solution here after deadline.
|
||||
|
||||
|
||||
48
cohorts/2025/README.md
Normal file
@ -0,0 +1,48 @@
|
||||
## Data Engineering Zoomcamp 2025 Cohort
|
||||
|
||||
* [Pre-launch Q&A stream](TBA)
|
||||
* [Launch stream with course overview](TBA)
|
||||
* [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ)
|
||||
* [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing)
|
||||
* Course Playlist: Only 2024 Live videos & homeworks (TODO)
|
||||
|
||||
|
||||
[**Module 1: Introduction & Prerequisites**](01-docker-terraform/)
|
||||
|
||||
* [Homework](01-docker-terraform/homework.md)
|
||||
|
||||
|
||||
[**Module 2: Workflow Orchestration**](02-workflow-orchestration)
|
||||
|
||||
* [Homework](02-workflow-orchestration/homework.md)
|
||||
* Office hours
|
||||
|
||||
[**Workshop 1: Data Ingestion**](workshops/dlt.md)
|
||||
|
||||
* Workshop with dlt
|
||||
* [Homework](workshops/dlt.md)
|
||||
|
||||
|
||||
[**Module 3: Data Warehouse**](03-data-warehouse)
|
||||
|
||||
* [Homework](03-data-warehouse/homework.md)
|
||||
|
||||
|
||||
[**Module 4: Analytics Engineering**](04-analytics-engineering/)
|
||||
|
||||
* [Homework](04-analytics-engineering/homework.md)
|
||||
|
||||
|
||||
[**Module 5: Batch processing**](05-batch/)
|
||||
|
||||
* [Homework](05-batch/homework.md)
|
||||
|
||||
|
||||
[**Module 6: Stream Processing**](06-streaming)
|
||||
|
||||
* [Homework](06-streaming/homework.md)
|
||||
|
||||
|
||||
[**Project**](project.md)
|
||||
|
||||
More information [here](project.md)
|
||||
43
cohorts/2025/project.md
Normal file
@ -0,0 +1,43 @@
|
||||
## Course Project
|
||||
|
||||
The goal of this project is to apply everything we learned
|
||||
in this course and build an end-to-end data pipeline.
|
||||
|
||||
You will have two attempts to submit your project. If you don't have
|
||||
time to submit your project by the end of attempt #1 (you started the
|
||||
course late, you have vacation plans, life/work got in the way, etc.)
|
||||
or you fail your first attempt,
|
||||
then you will have a second chance to submit your project as attempt
|
||||
#2.
|
||||
|
||||
There are only two attempts.
|
||||
|
||||
Remember that to pass the project, you must evaluate 3 peers. If you don't do that,
|
||||
your project can't be considered complete.
|
||||
|
||||
To find the projects assigned to you, use the peer review assignments link
|
||||
and find your hash in the first column. You will see three rows: you need to evaluate
|
||||
each of these projects. For each project, you need to submit the form once,
|
||||
so in total, you will make three submissions.
|
||||
|
||||
|
||||
### Submitting
|
||||
|
||||
#### Project Attempt #1
|
||||
|
||||
* Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project1
|
||||
* Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project1/eval
|
||||
|
||||
#### Project Attempt #2
|
||||
|
||||
* Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project2
|
||||
* Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project2/eval
|
||||
|
||||
> **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2024/enrollment -
|
||||
this is what we will use when generating certificates for you.
|
||||
|
||||
### Evaluation criteria
|
||||
|
||||
See [here](../../week_7_project/README.md)
|
||||
|
||||
|
||||
5
cohorts/2025/workshops/dlt.md
Normal file
@ -0,0 +1,5 @@
|
||||
# Data ingestion with dlt
|
||||
|
||||
Sign up here: https://lu.ma/quyfn4q8 (optional)
|
||||
|
||||
Details TBA
|
||||
BIN
images/architecture/arch_v4_workshops.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 119 KiB |
18
images/kestra.svg
Normal file
@ -0,0 +1,18 @@
|
||||
<svg width="233" height="68" viewBox="0 0 233 68" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<rect x="0.901123" width="232" height="68" rx="10" fill="#FBFBFB"/>
|
||||
<path d="M76.5176 15.4844H68.3245V52.0398H76.5176V44.759L79.5817 41.6282L86.4414 52.0398H95.7628L85.1438 35.969L95.3646 25.5348H85.2477L76.5176 34.6997V15.4844Z" fill="#26282D"/>
|
||||
<path d="M161.375 17.419H153.278V25.5348H148.585V32.0804H153.278V46.0303C153.278 47.9809 153.784 49.4988 154.85 50.5263C155.914 51.5529 157.485 52.0398 159.505 52.0398H166.597V45.6358H161.375V32.0804H166.597V25.5348H161.375V17.419Z" fill="#26282D"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M108.268 25.063C104.269 25.063 101.032 26.2737 98.591 28.7155L98.5889 28.7177C96.1783 31.178 94.9898 34.5473 94.9898 38.7873C94.9898 42.9802 96.1622 46.3323 98.5386 48.8076C100.919 51.2869 104.208 52.5117 108.364 52.5117C111.75 52.5117 114.607 51.7291 116.915 50.1453L116.916 50.1441L116.918 50.1429C119.225 48.5115 120.665 46.3552 121.226 43.6844L121.302 43.3237H113.268L113.199 43.5304C112.92 44.3664 112.354 45.0428 111.48 45.5605C110.622 46.0618 109.572 46.32 108.316 46.32C106.787 46.32 105.582 45.9055 104.671 45.1005C103.762 44.2819 103.303 43.2171 103.303 41.878V40.7861H120.814L120.845 40.5211C120.975 39.4405 121.04 38.3828 121.04 37.3482C121.04 33.5575 119.871 30.5543 117.504 28.3784C115.19 26.1597 112.1 25.063 108.268 25.063ZM104.617 32.3848C105.496 31.6405 106.67 31.2547 108.171 31.2547C109.624 31.2547 110.766 31.6321 111.63 32.3615L111.632 32.3633L111.634 32.3651C112.507 33.076 112.943 34.0042 112.943 35.1777V35.3494H103.303V35.272C103.303 34.0644 103.742 33.1123 104.615 32.3867L104.617 32.3848Z" fill="#26282D"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M135.174 25.063C131.745 25.063 128.993 25.8116 126.95 27.3424L126.949 27.3431C124.91 28.8833 123.893 31.0193 123.893 33.7149C123.893 35.8196 124.566 37.5446 125.928 38.8596C127.283 40.1673 129.292 41.0424 131.918 41.5126L131.921 41.5131L136.62 42.2691L136.624 42.2697C137.413 42.3758 138.001 42.5845 138.412 42.876L138.414 42.8774L138.416 42.8788C138.791 43.1366 139.05 43.6198 139.127 44.4161C139.124 45.1308 138.834 45.6701 138.249 46.0664L138.246 46.0684C137.661 46.4742 136.787 46.6975 135.583 46.6975C134.261 46.6975 133.269 46.4153 132.57 45.888C131.903 45.3721 131.557 44.6325 131.557 43.6238V43.3237H123.484V43.6238C123.484 46.3267 124.57 48.4997 126.734 50.1146L126.734 50.1153C128.906 51.7244 131.851 52.5117 135.535 52.5117C139.138 52.5117 142.018 51.7482 144.141 50.1868C146.293 48.6166 147.369 46.433 147.369 43.671C147.369 41.4958 146.734 39.7371 145.435 38.4304C144.141 37.1294 142.223 36.3106 139.722 35.9428L139.719 35.9425L135.171 35.2112L135.168 35.2107C134.603 35.1338 134.217 35.0798 134.007 35.0484C133.841 35.0062 133.622 34.9282 133.349 34.809C133.092 34.6974 132.838 34.5345 132.585 34.3159C132.225 34.0019 132.038 33.5842 132.038 33.0307C132.038 32.3299 132.297 31.8189 132.803 31.4565C133.325 31.0831 134.087 30.8772 135.126 30.8772C136.318 30.8772 137.165 31.1171 137.714 31.5486C138.252 31.9712 138.551 32.6147 138.574 33.5334L138.581 33.8263H146.632L146.622 33.5167C146.539 30.9288 145.471 28.8613 143.421 27.3419C141.378 25.8113 138.617 25.063 135.174 25.063ZM143.958 49.9467L144.141 50.1868L144.141 50.1874L143.958 49.9467ZM132.382 46.1246L132.569 45.887L132.57 45.888L132.571 45.8888L132.382 46.1246ZM139.433 44.4024L139.129 44.4301L139.127 44.4161L139.127 44.4024H139.433Z" fill="#26282D"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M199.96 25.063C196.386 25.063 193.491 25.8514 191.305 27.4585C189.124 29.0584 187.932 31.3451 187.718 34.2833L187.694 34.6048H195.696L195.717 34.3271C195.792 33.3391 196.155 32.6117 196.785 32.107L196.785 32.1065C197.434 31.5832 198.367 31.3019 199.623 31.3019C200.887 31.3019 201.778 31.5985 202.353 32.1387C202.927 32.6765 203.24 33.5051 203.24 34.6822V35.892H196.567C193.548 35.892 191.12 36.6284 189.317 38.1342C187.522 39.6304 186.634 41.6851 186.634 44.2608C186.634 46.7498 187.42 48.7673 189.013 50.2802L189.016 50.2824C190.625 51.7776 192.768 52.5117 195.411 52.5117C197.427 52.5117 199.211 52.1098 200.755 51.2962L200.757 51.2951C202.162 50.5399 203.159 49.5447 203.725 48.3057L203.797 48.7443L203.798 48.7484C204 49.8729 204.481 50.7321 205.271 51.2773L205.273 51.2786L205.275 51.2799C206.057 51.803 207.24 52.0398 208.768 52.0398H214.61V46.32H211.361V34.8238C211.361 32.124 210.443 29.8163 208.607 27.9184C206.791 25.9917 203.883 25.063 199.96 25.063ZM194.851 43.1284C194.851 42.3709 195.11 41.8447 195.598 41.495C196.104 41.1326 196.9 40.9277 198.035 40.9277H203.24V41.5477C203.24 43.0544 202.774 44.2211 201.866 45.0824C200.957 45.9286 199.709 46.3672 198.083 46.3672C196.932 46.3672 196.125 46.1566 195.612 45.7833C195.116 45.4225 194.851 44.8784 194.851 44.0957V43.1284Z" fill="#26282D"/>
|
||||
<path d="M177.217 25.5348C174.438 25.5348 172.503 26.2508 171.586 27.8082C171.188 28.474 170.911 29.1308 170.758 29.7782L170.757 29.7811L170.757 29.7839C170.623 30.4086 170.558 31.1701 170.558 32.0634V52.0398H178.655V32.0804H186.5V25.5348H177.217Z" fill="#26282D"/>
|
||||
<path d="M219.974 52.0637C219.102 52.0637 218.379 51.7923 217.805 51.2495C217.254 50.6831 216.979 49.9868 216.979 49.1608C216.979 48.3347 217.254 47.6503 217.805 47.1075C218.379 46.541 219.102 46.2578 219.974 46.2578C220.824 46.2578 221.524 46.541 222.075 47.1075C222.626 47.6503 222.901 48.3347 222.901 49.1608C222.901 49.9868 222.626 50.6831 222.075 51.2495C221.524 51.7923 220.824 52.0637 219.974 52.0637Z" fill="#F62E77"/>
|
||||
<path d="M10.9011 15.3676C10.9011 12.4032 13.3525 10 16.3765 10H54.4038C57.4277 10 59.8791 12.4032 59.8791 15.3676V52.6467C59.8791 55.6111 57.4277 58.0143 54.4038 58.0143H16.3765C13.3525 58.0143 10.9011 55.6111 10.9011 52.6467V15.3676Z" fill="#2C0059"/>
|
||||
<path d="M34.1873 30.1515C34.8515 29.5004 35.9285 29.5004 36.5927 30.1515L39.3231 32.8282C39.9873 33.4793 39.9873 34.5351 39.3231 35.1862L36.5927 37.8629C35.9285 38.5141 34.8515 38.5141 34.1873 37.8629L31.4569 35.1862C30.7927 34.5351 30.7927 33.4793 31.4569 32.8282L34.1873 30.1515Z" fill="#A950FF"/>
|
||||
<path d="M46.763 30.1302C47.4154 29.4907 48.473 29.4907 49.1254 30.1302L51.8988 32.849C52.5512 33.4885 52.5512 34.5254 51.8988 35.1649L49.1254 37.8837C48.473 38.5233 47.4154 38.5233 46.763 37.8837L43.9896 35.1649C43.3373 34.5254 43.3373 33.4885 43.9896 32.849L46.763 30.1302Z" fill="#A950FF"/>
|
||||
<path d="M34.2088 17.8234C34.8612 17.1839 35.9188 17.1839 36.5712 17.8234L39.3446 20.5422C39.997 21.1817 39.997 22.2186 39.3446 22.8581L36.5712 25.5769C35.9188 26.2165 34.8612 26.2165 34.2088 25.5769L31.4354 22.8581C30.7831 22.2186 30.7831 21.1817 31.4354 20.5422L34.2088 17.8234Z" fill="#E9C1FF"/>
|
||||
<path d="M33.0466 26.6747C33.7109 27.3259 33.7109 28.3816 33.0466 29.0328L30.3162 31.7094C29.652 32.3606 28.5751 32.3606 27.9108 31.7094L25.1805 29.0328C24.5162 28.3816 24.5162 27.3259 25.1805 26.6747L27.9108 23.9981C28.5751 23.3469 29.652 23.3469 30.3162 23.9981L33.0466 26.6747Z" fill="#CD88FF"/>
|
||||
<path d="M26.7908 32.849C27.4431 33.4885 27.4431 34.5254 26.7908 35.1649L24.0174 37.8837C23.365 38.5233 22.3074 38.5233 21.655 37.8837L18.8816 35.1649C18.2292 34.5254 18.2292 33.4885 18.8816 32.849L21.655 30.1302C22.3074 29.4907 23.365 29.4907 24.0174 30.1302L26.7908 32.849Z" fill="#A950FF"/>
|
||||
<path d="M45.6003 26.6747C46.2645 27.3259 46.2645 28.3816 45.6003 29.0328L42.8699 31.7094C42.2057 32.3606 41.1288 32.3606 40.4645 31.7094L37.7342 29.0328C37.0699 28.3816 37.0699 27.3259 37.7342 26.6747L40.4645 23.9981C41.1288 23.3469 42.2057 23.3469 42.8699 23.9981L45.6003 26.6747Z" fill="#CD88FF"/>
|
||||
<path d="M38.0771 44.2537C39.5611 45.7085 39.5611 48.0671 38.0771 49.5218C36.5932 50.9766 34.1872 50.9766 32.7033 49.5218C31.2193 48.0671 31.2193 45.7085 32.7033 44.2537C34.1872 42.799 36.5932 42.799 38.0771 44.2537Z" fill="#F62E76"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 7.6 KiB |
37
learning-in-public.md
Normal file
@ -0,0 +1,37 @@
|
||||
# Learning in public
|
||||
|
||||
Most people learn in private: they consume content but don't tell
|
||||
anyone about it. There's nothing wrong with that.
|
||||
|
||||
But we want to encourage you to document your progress and
|
||||
share it publicly on social media.
|
||||
|
||||
It helps you get noticed and will lead to:
|
||||
|
||||
* Expanding your network: meeting new people and making new friends
|
||||
* Being invited to meetups, conferences and podcasts
|
||||
* Landing a job or getting clients
|
||||
* Many other good things
|
||||
|
||||
Here's a more comprehensive read on why you want to do it: https://github.com/readme/guides/publishing-your-work
|
||||
|
||||
|
||||
## Learning in Public for Zoomcamps
|
||||
|
||||
When you submit your homework or project, you can also submit
|
||||
learning in public posts:
|
||||
|
||||
<img src="https://github.com/DataTalksClub/mlops-zoomcamp/raw/main/images/learning-in-public-links.png" />
|
||||
|
||||
You can watch this video to see what your learning in public posts may look like:
|
||||
|
||||
<a href="https://www.loom.com/share/710e3297487b409d94df0e8da1c984ce" target="_blank">
|
||||
<img src="https://github.com/DataTalksClub/mlops-zoomcamp/raw/main/images/learning-in-public.png" height="240" />
|
||||
</a>
|
||||
|
||||
## Daily Documentation
|
||||
|
||||
- **Post Daily Diaries**: Document what you learn each day, including the challenges faced and the methods used to overcome them.
|
||||
- **Create Quick Videos**: Make short videos showcasing your work and upload them to GitHub.
|
||||
|
||||
Send a PR if you want to suggest improvements for this document
|
||||
@ -63,8 +63,8 @@ Example dashboard:  and using it for the capstone project
|
||||
* Re-using your own projects (in full or partly) from other courses and bootcamps
|
||||
* Re-using your midterm project from ML Zoomcamp in capstone
|
||||
* Re-using your ML Zoomcamp project from previous iterations of the course
|
||||
|
||||
Violating any of this will result in 0 points for this project.
|
||||
|
||||
## Resources
|
||||
|
||||
### Datasets
|
||||
|
||||