Compare commits
10 Commits
415f417182
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 946e4020d9 | |||
| 571c749e25 | |||
| 718407d709 | |||
| 348a074a3a | |||
| 6c70628616 | |||
| e64288694b | |||
| ee95be1031 | |||
| 5f6251114f | |||
| a49013fc0c | |||
| 3cd7be1e7b |
@@ -1,9 +1,9 @@
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
$CONTAINER = "ewc2025-mysql"
|
||||
$CONTAINER = "hotel-mysql"
|
||||
$IMAGE = "mysql:8.4"
|
||||
$ROOT_PASSWORD = "ewc2025root"
|
||||
$DATABASE = "ewc2025"
|
||||
$ROOT_PASSWORD = "hotel2025root"
|
||||
$DATABASE = "hotel_reservations"
|
||||
$PORT = "13306"
|
||||
|
||||
$SQL_DIR = Resolve-Path "$PSScriptRoot\..\sql"
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
CONTAINER="ewc2025-mysql"
|
||||
CONTAINER="hotel-mysql"
|
||||
IMAGE="mysql:8.4"
|
||||
ROOT_PASSWORD="ewc2025root"
|
||||
DATABASE="ewc2025"
|
||||
ROOT_PASSWORD="hotel2025root"
|
||||
DATABASE="hotel_reservations"
|
||||
PORT="13306"
|
||||
|
||||
RUNTIME="docker"
|
||||
@@ -26,7 +26,7 @@ else
|
||||
-e MYSQL_DATABASE="${DATABASE}" \
|
||||
-p "${PORT}:3306" \
|
||||
-v "${CONTAINER}-data:/var/lib/mysql" \
|
||||
-v "${SQL_DIR}/schema.sql:/docker-entrypoint-initdb.d/01_schema.sql:ro" \
|
||||
-v "${SQL_DIR}/schema.sql:/docker-entrypoint-initdb.d/01_schema.sql:ro,z" \
|
||||
"${IMAGE}"
|
||||
fi
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
$CONTAINER = "ewc2025-mysql"
|
||||
$CONTAINER = "hotel-mysql"
|
||||
|
||||
$exists = docker ps -a --format '{{.Names}}' | Where-Object { $_ -eq $CONTAINER }
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
CONTAINER="ewc2025-mysql"
|
||||
CONTAINER="hotel-mysql"
|
||||
|
||||
RUNTIME="docker"
|
||||
if [[ "${1:-}" == "--podman" ]]; then
|
||||
|
||||
98
docs/01-overview.md
Normal file
98
docs/01-overview.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Hotel Reservations — Data Warehouse Project
|
||||
|
||||
## Project Summary
|
||||
|
||||
This project implements a complete **Data Warehousing pipeline** for a hotel reservation system, covering all standard DW layers:
|
||||
|
||||
```
|
||||
MySQL OLTP ──► Apache NiFi ETL ──► Oracle Data Mart ──► Power BI Reports
|
||||
(source) (transform) (analytical store) (OLAP queries)
|
||||
```
|
||||
|
||||
The system is built around the **A.24 Hotel Reservations** domain from the course specification. The OLTP database was populated with **~1.14 million synthetically generated rows** covering 200 hotels, 100,000 guests, 500,000 bookings, and ~531,000 room bookings across a 4-year period (2022–2025).
|
||||
|
||||
---
|
||||
|
||||
## Business Context
|
||||
|
||||
A hotel chain needs to answer questions like:
|
||||
|
||||
- Which countries generate the most revenue per quarter?
|
||||
- How does occupancy differ between peak and off-peak seasons?
|
||||
- What is the revenue contribution of 5-star vs 3-star hotels?
|
||||
- How has a hotel's revenue changed after upgrading its star rating?
|
||||
|
||||
These questions require **historical, multi-dimensional analysis** that a normalized OLTP database cannot serve efficiently. The data mart provides pre-modelled, denormalized data optimized for analytical queries.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ SOURCE LAYER │
|
||||
│ MySQL 8.4 (Docker/Podman, port 13306) │
|
||||
│ Database: hotel_reservations │
|
||||
│ 13 normalized tables, ~1.14M rows │
|
||||
└───────────────────────┬─────────────────────────────────┘
|
||||
│ JDBC (MySQL Connector/J)
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ ETL LAYER │
|
||||
│ Apache NiFi │
|
||||
│ 5 Process Groups: Date Dim / Static Dims / │
|
||||
│ SCD2 Hotel / SCD1 Guest / Incremental Fact │
|
||||
└───────────────────────┬─────────────────────────────────┘
|
||||
│ JDBC (Oracle JDBC)
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ DATA MART LAYER │
|
||||
│ Oracle (university lab schema) │
|
||||
│ Star schema: 6 dimensions + 1 fact table │
|
||||
│ SCD Type 2 on DIM_HOTEL │
|
||||
└───────────────────────┬─────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ PRESENTATION LAYER │
|
||||
│ Power BI Desktop │
|
||||
│ OLAP reports via DirectQuery / Import │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Technology Stack
|
||||
|
||||
| Component | Technology | Version |
|
||||
|-----------|-----------|---------|
|
||||
| OLTP Database | MySQL | 8.4 |
|
||||
| Container runtime | Docker / Podman | — |
|
||||
| Data generator | C# (.NET) | 10 |
|
||||
| ETL tool | Apache NiFi | — |
|
||||
| Data Mart | Oracle RDBMS | university lab |
|
||||
| Reporting | Power BI Desktop | — |
|
||||
|
||||
---
|
||||
|
||||
## Repository Structure
|
||||
|
||||
```
|
||||
IPZ_1/
|
||||
├── docker/
|
||||
│ ├── start.sh # Start MySQL container (Linux/macOS)
|
||||
│ ├── stop.sh # Stop MySQL container
|
||||
│ ├── start.ps1 # Start MySQL container (Windows)
|
||||
│ └── stop.ps1 # Stop MySQL container
|
||||
├── sql/
|
||||
│ ├── schema.sql # MySQL OLTP DDL
|
||||
│ └── datamart_schema.sql # Oracle Data Mart DDL
|
||||
├── generator/
|
||||
│ └── generate.cs # .NET 10 data generator script
|
||||
└── docs/
|
||||
├── 01-overview.md # This file
|
||||
├── 02-oltp.md # OLTP database design
|
||||
├── 03-datamart.md # Data mart design
|
||||
├── 04-setup.md # Setup and run guide
|
||||
├── 05-conclusion.md # Conclusion and design decisions
└── nifi-flow.md # NiFi ETL flow reference
|
||||
```
|
||||
258
docs/02-oltp.md
Normal file
258
docs/02-oltp.md
Normal file
@@ -0,0 +1,258 @@
|
||||
# OLTP Database — Design & Details
|
||||
|
||||
## Overview
|
||||
|
||||
The OLTP (Online Transaction Processing) database models a **hotel reservation system** using a fully normalized relational schema in **MySQL 8.4**. It follows 3NF and enforces referential integrity via foreign keys.
|
||||
|
||||
- **Database:** `hotel_reservations`
|
||||
- **Character set:** `utf8mb4` / `utf8mb4_unicode_ci`
|
||||
- **Tables:** 13
|
||||
- **Total rows:** ~1.14 million (dominated by `booking`, `room_booking`, and `guest`)
|
||||
|
||||
---
|
||||
|
||||
## Entity-Relationship Model
|
||||
|
||||
The schema covers five entity groups:
|
||||
|
||||
```
|
||||
hotel_chain ──┐
|
||||
country ───────┼──► hotel ──► hotel_room ──► room_booking ──► booking ──► guest
|
||||
star_rating ──┘ │
|
||||
└──► country
|
||||
hotel_characteristic ◄──► hotel (M:N via hotel_hotel_characteristic)
|
||||
|
||||
room_type ◄──── hotel_room
|
||||
room_type ◄──┐
|
||||
rate_period ◄─┴── period_room_rate (price per room type per season)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Table Descriptions
|
||||
|
||||
### Reference / Lookup Tables
|
||||
|
||||
#### `hotel_chain`
|
||||
International hotel chains (Hilton, Marriott, Accor, etc.).
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `hotel_chain_id` | INT UNSIGNED PK | Surrogate key |
|
||||
| `code` | VARCHAR(10) UNIQUE | Short code (e.g. `HLT`) |
|
||||
| `name` | VARCHAR(100) | Full name |
|
||||
|
||||
**Rows:** 10
|
||||
|
||||
---
|
||||
|
||||
#### `country`
|
||||
Countries from which guests come and where hotels are located.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `country_id` | INT UNSIGNED PK | Surrogate key |
|
||||
| `code` | CHAR(2) UNIQUE | ISO 3166-1 alpha-2 (e.g. `GB`) |
|
||||
| `name` | VARCHAR(100) | Country name |
|
||||
| `currency` | VARCHAR(10) | ISO currency code (e.g. `EUR`) |
|
||||
|
||||
**Rows:** 40 (Europe, Americas, Asia, Africa, Oceania)
|
||||
|
||||
---
|
||||
|
||||
#### `star_rating`
|
||||
Hotel classification from 1★ to 5★.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `star_rating_id` | INT UNSIGNED PK | Surrogate key |
|
||||
| `code` | TINYINT UNIQUE | 1–5 |
|
||||
| `description` | VARCHAR(20) | e.g. `3 Star` |
|
||||
|
||||
**Rows:** 5
|
||||
|
||||
---
|
||||
|
||||
#### `hotel_characteristic`
|
||||
Amenities and features a hotel may offer.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `characteristic_id` | INT UNSIGNED PK | Surrogate key |
|
||||
| `code` | VARCHAR(20) UNIQUE | e.g. `POOL`, `SPA`, `WIFI` |
|
||||
| `description` | VARCHAR(100) | Human-readable label |
|
||||
|
||||
**Rows:** 12 (WiFi, Pool, Gym, Spa, Restaurant, Bar, Parking, Valet, Conference, Shuttle, Room Service, Pet Friendly)
|
||||
|
||||
---
|
||||
|
||||
#### `room_type`
|
||||
Types of rooms a hotel can offer, with a standard (base) rate.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `room_type_id` | INT UNSIGNED PK | Surrogate key |
|
||||
| `code` | VARCHAR(20) UNIQUE | e.g. `SINGLE`, `SUITE` |
|
||||
| `description` | VARCHAR(100) | e.g. `Junior Suite` |
|
||||
| `standard_rate` | DECIMAL(10,2) | Base nightly rate (EUR) |
|
||||
| `smoking_yn` | BOOLEAN | Smoking allowed flag |
|
||||
|
||||
**Rows:** 7 (Single €80, Double €120, Twin €115, Deluxe €180, Suite €280, Executive €450, Family €200)
|
||||
|
||||
---
|
||||
|
||||
#### `rate_period`
|
||||
Seasonal pricing periods. Each period maps to a month range and applies a rate multiplier.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `rate_period_id` | INT UNSIGNED PK | Surrogate key |
|
||||
| `code` | VARCHAR(20) UNIQUE | e.g. `PEAK`, `WINTER` |
|
||||
| `description` | VARCHAR(50) | Human-readable label |
|
||||
| `month_from` | TINYINT | Start month (1–12) |
|
||||
| `month_to` | TINYINT | End month (1–12); smaller than `month_from` when the period wraps the year end (e.g. `WINTER`: 12 → 2) |
|
||||
|
||||
**Rows:** 4
|
||||
|
||||
| Code | Period | Months | Multiplier |
|
||||
|------|--------|--------|-----------|
|
||||
| PEAK | Peak Season | Jun–Aug | ×1.5 |
|
||||
| HIGH | High Season | Mar–May | ×1.2 |
|
||||
| AUTUMN | Autumn Season | Sep–Nov | ×1.1 |
|
||||
| WINTER | Winter Season | Dec–Feb | ×0.9 |
|
||||
|
||||
---
|
||||
|
||||
### Junction Tables
|
||||
|
||||
#### `period_room_rate`
|
||||
The effective nightly rate for each (room_type, rate_period) combination.
|
||||
Rate = `standard_rate × season_multiplier`.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `room_type_id` | INT UNSIGNED PK/FK | |
|
||||
| `rate_period_id` | INT UNSIGNED PK/FK | |
|
||||
| `rate` | DECIMAL(10,2) | Effective nightly rate |
|
||||
|
||||
**Rows:** 28 (7 room types × 4 seasons)
|
||||
|
||||
---
|
||||
|
||||
#### `hotel_hotel_characteristic`
|
||||
M:N junction between hotels and their amenities.
|
||||
|
||||
| Column | Type |
|
||||
|--------|------|
|
||||
| `hotel_id` | INT UNSIGNED PK/FK |
|
||||
| `characteristic_id` | INT UNSIGNED PK/FK |
|
||||
|
||||
**Rows:** ~1,415
|
||||
|
||||
---
|
||||
|
||||
### Core Entity Tables
|
||||
|
||||
#### `hotel`
|
||||
Individual hotel properties.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `hotel_id` | INT UNSIGNED PK | |
|
||||
| `hotel_chain_id` | INT UNSIGNED FK | NULL for independent hotels |
|
||||
| `country_id` | INT UNSIGNED FK | |
|
||||
| `star_rating_id` | INT UNSIGNED FK | |
|
||||
| `code` | VARCHAR(20) UNIQUE | e.g. `HTL0001` |
|
||||
| `name` | VARCHAR(150) | |
|
||||
| `address` | VARCHAR(200) | |
|
||||
| `postcode` | VARCHAR(20) | |
|
||||
| `city` | VARCHAR(100) | |
|
||||
| `url` | VARCHAR(200) | |
|
||||
|
||||
**Rows:** 200 (50 cities, star distribution: 5% 1★, 10% 2★, 40% 3★, 30% 4★, 15% 5★)
|
||||
|
||||
---
|
||||
|
||||
#### `hotel_room`
|
||||
Individual rooms within each hotel.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `room_id` | INT UNSIGNED PK | |
|
||||
| `hotel_id` | INT UNSIGNED FK | |
|
||||
| `room_type_id` | INT UNSIGNED FK | |
|
||||
| `room_number` | VARCHAR(10) | Format: `{floor}{number}`, e.g. `101` |
|
||||
| `floor` | TINYINT UNSIGNED | |
|
||||
|
||||
**Rows:** 5,334 (5–60 rooms per hotel depending on star rating)
|
||||
|
||||
---
|
||||
|
||||
#### `guest`
|
||||
Hotel guests.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `guest_id` | INT UNSIGNED PK | |
|
||||
| `country_id` | INT UNSIGNED FK | Guest's home country |
|
||||
| `name` | VARCHAR(150) | Full name |
|
||||
| `email` | VARCHAR(150) | Unique synthetic email |
|
||||
| `address` | VARCHAR(200) | |
|
||||
| `city` | VARCHAR(100) | |
|
||||
|
||||
**Rows:** 100,000
|
||||
|
||||
---
|
||||
|
||||
#### `booking`
|
||||
A reservation made by a guest at a hotel. One booking can cover multiple rooms.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `booking_id` | INT UNSIGNED PK | |
|
||||
| `guest_id` | INT UNSIGNED FK | |
|
||||
| `hotel_id` | INT UNSIGNED FK | |
|
||||
| `date_from` | DATE | Check-in |
|
||||
| `date_to` | DATE | Check-out |
|
||||
| `status` | ENUM | `confirmed`, `cancelled`, `completed`, `no_show` |
|
||||
| `created_at` | DATETIME | When booking was made |
|
||||
|
||||
**Rows:** 500,000
|
||||
**Status distribution:** 80% completed, 10% confirmed, 7% cancelled, 3% no_show
|
||||
**Date range:** 2022-01-01 – 2025-12-31
|
||||
**Seasonal distribution:** June–August heaviest (peak), December–February lightest
|
||||
|
||||
---
|
||||
|
||||
#### `room_booking`
|
||||
A specific room assigned within a booking. Stores the rate **as it was at booking time** (snapshot), independent of any future rate changes.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `room_booking_id` | INT UNSIGNED PK | |
|
||||
| `booking_id` | INT UNSIGNED FK | |
|
||||
| `room_id` | INT UNSIGNED FK | |
|
||||
| `date_from` | DATE | |
|
||||
| `date_to` | DATE | |
|
||||
| `nightly_rate` | DECIMAL(10,2) | Rate at time of booking |
|
||||
| `total_amount` | DECIMAL(10,2) | `nightly_rate × nights` |
|
||||
|
||||
**Rows:** 531,382
|
||||
**Room count per booking:** 90% single room, 8% two rooms, 2% three rooms
|
||||
|
||||
---
|
||||
|
||||
## Data Generation
|
||||
|
||||
The database was populated using a **single-file C# script** (`generator/generate.cs`) running on .NET 10, using `MySqlConnector` as the only dependency.
|
||||
|
||||
Key generation decisions:
|
||||
- **Seasonal booking distribution** via rejection sampling — months Jun–Aug are ~2.7× more likely than Jan–Feb
|
||||
- **Rate snapshot** — each `room_booking.nightly_rate` is looked up from `period_room_rate` at insert time and stored, not re-computed later
|
||||
- **Realistic stay lengths** — 30% one night, 25% two nights, 20% three nights, tapering off to 14-night stays
|
||||
- **Cancelled/no-show bookings** partially skip room assignment (60% of cancellations have no room_booking)
|
||||
|
||||
```bash
|
||||
# Run generator
|
||||
dotnet run generator/generate.cs
|
||||
```
|
||||
255
docs/03-datamart.md
Normal file
255
docs/03-datamart.md
Normal file
@@ -0,0 +1,255 @@
|
||||
# Data Mart — Design & Details
|
||||
|
||||
## Overview
|
||||
|
||||
The data mart uses a **star schema** stored in an Oracle database (university lab schema). It is optimized for analytical queries against hotel reservation data — revenue analysis, occupancy trends, seasonal patterns, and guest origin breakdowns.
|
||||
|
||||
- **Schema type:** Star schema
|
||||
- **Dimensions:** 6 (+ date dimension)
|
||||
- **Fact table:** `FACT_ROOM_BOOKING`
|
||||
- **Grain:** One row per room_booking (one room, one stay)
|
||||
- **SCD strategy:** Type 2 on DIM_HOTEL, Type 1 on all others
|
||||
|
||||
---
|
||||
|
||||
## Star Schema Diagram
|
||||
|
||||
```
|
||||
DIM_DATE
|
||||
(date_key)
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ checkin / checkout │
|
||||
│ │
|
||||
DIM_HOTEL_CHAIN ◄─ DIM_HOTEL ─► DIM_STAR_RATING
|
||||
│ │
|
||||
│ FACT_ROOM_BOOKING ◄──── DIM_ROOM
|
||||
│ │
|
||||
└───────► DIM_COUNTRY ◄───── DIM_GUEST
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dimension Tables
|
||||
|
||||
### DIM_DATE
|
||||
Populated once for the range 2020–2030. Used for both check-in and check-out date lookups.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `date_key` | NUMBER(8) PK | YYYYMMDD integer key |
|
||||
| `full_date` | DATE | Actual date value |
|
||||
| `year` | NUMBER(4) | |
|
||||
| `quarter` | NUMBER(1) | 1–4 |
|
||||
| `month` | NUMBER(2) | 1–12 |
|
||||
| `month_name` | VARCHAR2(10) | e.g. `January` |
|
||||
| `week_number` | NUMBER(2) | ISO week number |
|
||||
| `day_of_month` | NUMBER(2) | |
|
||||
| `day_name` | VARCHAR2(10) | e.g. `Monday` |
|
||||
| `is_weekend` | NUMBER(1) | 0/1 |
|
||||
| `is_business_day` | NUMBER(1) | 0/1 |
|
||||
| `season` | VARCHAR2(10) | Peak / High / Autumn / Winter |
|
||||
|
||||
Using an integer date key (YYYYMMDD) instead of a DATE FK allows efficient range predicates: `checkin_date_key BETWEEN 20240601 AND 20240831`.
|
||||
|
||||
---
|
||||
|
||||
### DIM_COUNTRY (SCD Type 1)
|
||||
Country attributes are stable. If a name or currency ever changes, the row is simply overwritten (no history needed).
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `country_key` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `country_id` | NUMBER(10) UNIQUE | Natural key from MySQL |
|
||||
| `code` | CHAR(2) | ISO alpha-2 |
|
||||
| `name` | VARCHAR2(100) | |
|
||||
| `currency` | VARCHAR2(10) | ISO currency code |
|
||||
|
||||
---
|
||||
|
||||
### DIM_STAR_RATING (SCD Type 1)
|
||||
Static lookup. Star rating codes 1–5 never change.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `star_rating_key` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `star_rating_id` | NUMBER(10) UNIQUE | Natural key |
|
||||
| `code` | NUMBER(1) | 1–5 |
|
||||
| `description` | VARCHAR2(20) | e.g. `4 Star` |
|
||||
|
||||
---
|
||||
|
||||
### DIM_HOTEL_CHAIN (SCD Type 1)
|
||||
Chain name/code may be updated (e.g. corporate rebranding), but we do not need a historical record of chain name changes.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `hotel_chain_key` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `hotel_chain_id` | NUMBER(10) UNIQUE | Natural key |
|
||||
| `code` | VARCHAR2(10) | e.g. `HLT` |
|
||||
| `name` | VARCHAR2(100) | |
|
||||
|
||||
---
|
||||
|
||||
### DIM_HOTEL (SCD Type 2)
|
||||
|
||||
This is the most analytically significant dimension and the only one implemented as **Slowly Changing Dimension Type 2**.
|
||||
|
||||
**Why SCD Type 2 here?**
|
||||
|
||||
A hotel's star rating or chain affiliation can change over time — a property gets renovated and reclassified from 3★ to 4★, or switches from one international chain to another. These changes directly affect revenue analysis: a 3★ hotel charges different rates than a 4★ hotel, and grouping all historical bookings under the current star rating would produce misleading averages.
|
||||
|
||||
SCD Type 2 preserves history by creating a **new row** for each version of a hotel, while expiring the old row with an `expiry_date`. The fact table's `hotel_key` always points to the version that was active **at check-in date**, never to the current version if it changed.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `hotel_key` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `source_hotel_id` | NUMBER(10) | Natural key from MySQL |
|
||||
| `hotel_chain_key` | NUMBER(10) FK | NULL for independent hotels |
|
||||
| `country_key` | NUMBER(10) FK | |
|
||||
| `star_rating_key` | NUMBER(10) FK | |
|
||||
| `code` | VARCHAR2(20) | |
|
||||
| `name` | VARCHAR2(150) | |
|
||||
| `city` | VARCHAR2(100) | |
|
||||
| `effective_date` | DATE | When this version became active |
|
||||
| `expiry_date` | DATE | When this version was superseded (NULL = current) |
|
||||
| `is_current` | NUMBER(1) | 1 = current version |
|
||||
|
||||
**SCD2 example:**
|
||||
|
||||
| hotel_key | source_hotel_id | star_rating | effective_date | expiry_date | is_current |
|
||||
|-----------|----------------|-------------|----------------|-------------|-----------|
|
||||
| 1 | 42 | 3★ | 2022-01-01 | 2024-05-31 | 0 |
|
||||
| 2 | 42 | 4★ | 2024-06-01 | NULL | 1 |
|
||||
|
||||
Bookings with a check-in date up to 2024-05-31 point to `hotel_key=1`; bookings with a check-in date on or after 2024-06-01 point to `hotel_key=2`. Revenue by star category remains historically correct.
|
||||
|
||||
---
|
||||
|
||||
### DIM_ROOM (SCD Type 1)
|
||||
Room type is stable for our dataset. Updated via MERGE if room details ever change.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `room_key` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `room_id` | NUMBER(10) UNIQUE | Natural key |
|
||||
| `hotel_key` | NUMBER(10) FK | Points to current DIM_HOTEL version |
|
||||
| `room_number` | VARCHAR2(10) | |
|
||||
| `floor` | NUMBER(3) | |
|
||||
| `room_type_code` | VARCHAR2(20) | e.g. `SUITE` |
|
||||
| `room_type_desc` | VARCHAR2(100) | |
|
||||
| `smoking_yn` | NUMBER(1) | |
|
||||
| `standard_rate` | NUMBER(10,2) | Base rate from OLTP |
|
||||
|
||||
---
|
||||
|
||||
### DIM_GUEST (SCD Type 1)
|
||||
Guest personal data (city, country) may change, but tracking historical addresses has no analytical value for this domain. MERGE (upsert) is used.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `guest_key` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `guest_id` | NUMBER(10) UNIQUE | Natural key |
|
||||
| `country_key` | NUMBER(10) FK | Home country |
|
||||
| `name` | VARCHAR2(150) | |
|
||||
| `city` | VARCHAR2(100) | |
|
||||
|
||||
---
|
||||
|
||||
## Fact Table: FACT_ROOM_BOOKING
|
||||
|
||||
**Grain:** One row per room_booking — one specific room, for one stay.
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `fact_id` | NUMBER(10) PK | Surrogate (IDENTITY) |
|
||||
| `source_rb_id` | NUMBER(10) UNIQUE | Natural key — used for idempotent incremental loads |
|
||||
| `hotel_key` | NUMBER(10) FK | SCD2-resolved hotel version at check-in |
|
||||
| `hotel_chain_key` | NUMBER(10) FK | Denormalized from DIM_HOTEL for convenience |
|
||||
| `room_key` | NUMBER(10) FK | |
|
||||
| `guest_key` | NUMBER(10) FK | |
|
||||
| `country_key` | NUMBER(10) FK | Guest's country — denormalized |
|
||||
| `star_rating_key` | NUMBER(10) FK | Denormalized from DIM_HOTEL for convenience |
|
||||
| `checkin_date_key` | NUMBER(8) FK | YYYYMMDD |
|
||||
| `checkout_date_key` | NUMBER(8) FK | YYYYMMDD |
|
||||
| `booking_status` | VARCHAR2(20) | Degenerate dimension: confirmed/completed/cancelled/no_show |
|
||||
| `nights_stayed` | NUMBER(4) | checkout − checkin in days |
|
||||
| `nightly_rate` | NUMBER(10,2) | Rate per night at time of booking |
|
||||
| `total_amount` | NUMBER(12,2) | `nightly_rate × nights_stayed` |
|
||||
|
||||
### Measures
|
||||
|
||||
| Measure | Type | Aggregation |
|
||||
|---------|------|-------------|
|
||||
| `nights_stayed` | Additive | SUM, AVG |
|
||||
| `nightly_rate` | Non-additive | AVG (not SUM — a per-night rate cannot be meaningfully summed across any dimension) |
|
||||
| `total_amount` | Additive | SUM (main revenue measure) |
|
||||
|
||||
### Degenerate Dimensions
|
||||
`booking_status` is stored directly on the fact row. Splitting it into a separate dimension table would add a table with only 4 rows and no other attributes — not worth the JOIN overhead.
|
||||
|
||||
---
|
||||
|
||||
## ETL Control Tables
|
||||
|
||||
### ETL_WATERMARK
|
||||
Tracks the highest `room_booking_id` already loaded into the fact table, enabling incremental loads without re-reading the entire source.
|
||||
|
||||
| Column | Description |
|
||||
|--------|-------------|
|
||||
| `entity_name` | Logical entity name (e.g. `FACT_ROOM_BOOKING`) |
|
||||
| `last_key` | Highest PK value loaded so far |
|
||||
| `last_run_ts` | Timestamp of the last ETL run |
|
||||
|
||||
### STG_HOTEL
|
||||
Staging table used by the SCD2 ETL process. NiFi loads raw hotel data from MySQL here, then SQL applies the expire-and-insert SCD2 logic in a single transaction. Truncated at the start of each ETL run.
|
||||
|
||||
---
|
||||
|
||||
## Sample Analytical Queries
|
||||
|
||||
### Revenue by country and quarter
|
||||
```sql
|
||||
SELECT
|
||||
c.name AS country,
|
||||
d.year,
|
||||
d.quarter,
|
||||
SUM(f.total_amount) AS revenue,
|
||||
COUNT(*) AS room_bookings
|
||||
FROM FACT_ROOM_BOOKING f
|
||||
JOIN DIM_DATE d ON d.date_key = f.checkin_date_key
|
||||
JOIN DIM_GUEST g ON g.guest_key = f.guest_key
|
||||
JOIN DIM_COUNTRY c ON c.country_key = g.country_key
|
||||
WHERE f.booking_status = 'completed'
|
||||
GROUP BY c.name, d.year, d.quarter
|
||||
ORDER BY revenue DESC;
|
||||
```
|
||||
|
||||
### Average nightly rate and total revenue per star category and season (correct because of SCD2)
|
||||
```sql
|
||||
SELECT
|
||||
sr.code AS stars,
|
||||
d.season,
|
||||
AVG(f.nightly_rate) AS avg_nightly_rate,
|
||||
SUM(f.total_amount) AS total_revenue
|
||||
FROM FACT_ROOM_BOOKING f
|
||||
JOIN DIM_HOTEL h ON h.hotel_key = f.hotel_key
|
||||
JOIN DIM_STAR_RATING sr ON sr.star_rating_key = f.star_rating_key
|
||||
JOIN DIM_DATE d ON d.date_key = f.checkin_date_key
|
||||
GROUP BY sr.code, d.season
|
||||
ORDER BY sr.code, d.season;
|
||||
```
|
||||
|
||||
### Top 10 cities by occupancy (room-nights)
|
||||
```sql
|
||||
SELECT
|
||||
h.city,
|
||||
SUM(f.nights_stayed) AS room_nights,
|
||||
SUM(f.total_amount) AS revenue
|
||||
FROM FACT_ROOM_BOOKING f
|
||||
JOIN DIM_HOTEL h ON h.hotel_key = f.hotel_key
|
||||
WHERE f.booking_status IN ('completed','confirmed')
|
||||
GROUP BY h.city
|
||||
ORDER BY room_nights DESC
|
||||
FETCH FIRST 10 ROWS ONLY;
|
||||
```
|
||||
181
docs/04-setup.md
Normal file
181
docs/04-setup.md
Normal file
@@ -0,0 +1,181 @@
|
||||
# Setup Guide
|
||||
|
||||
## Prerequisites
|
||||
|
||||
| Tool | Required for | Notes |
|
||||
|------|-------------|-------|
|
||||
| Docker or Podman | MySQL container | Scripts default to Docker; pass `--podman` to the shell scripts to use Podman |
|
||||
| .NET 10 SDK | Data generator | `dotnet run file.cs` support |
|
||||
| Apache NiFi | ETL | Running instance with Oracle + MySQL JDBC drivers |
|
||||
| Oracle JDBC driver | NiFi | `ojdbc11.jar` in NiFi's lib directory |
|
||||
| MySQL JDBC driver | NiFi | `mysql-connector-j-*.jar` in NiFi's lib directory |
|
||||
| Oracle DB access | Data mart target | University lab credentials |
|
||||
|
||||
---
|
||||
|
||||
## Step 1 — Start MySQL Container
|
||||
|
||||
**Linux / macOS (Docker):**
|
||||
```bash
|
||||
bash docker/start.sh
|
||||
```
|
||||
|
||||
**Linux / macOS (Podman):**
|
||||
```bash
|
||||
bash docker/start.sh --podman
|
||||
```
|
||||
|
||||
**Windows (PowerShell):**
|
||||
```powershell
|
||||
.\docker\start.ps1
|
||||
```
|
||||
|
||||
The script:
|
||||
- Creates a named container `hotel-mysql` with a persistent data volume
|
||||
- Mounts `sql/schema.sql` as an init script — all 13 tables are created automatically on first start
|
||||
- Waits until MySQL is ready before exiting
|
||||
|
||||
**Connection details:**
|
||||
```
|
||||
Host: 127.0.0.1
|
||||
Port: 13306
|
||||
Database: hotel_reservations
|
||||
User: root
|
||||
Password: hotel2025root
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 2 — Generate OLTP Data
|
||||
|
||||
```bash
|
||||
dotnet run generator/generate.cs
|
||||
```
|
||||
|
||||
**Runtime:** ~3 minutes
|
||||
**Output:** ~1.14 million rows across 13 tables
|
||||
|
||||
The generator is deterministic (fixed seed `42`) — every run against an empty database produces exactly the same data.
|
||||
|
||||
> **Important:** Run the generator only once on an empty database. If you need to restart, truncate all tables first (respecting FK order) or drop and recreate the container + volume.
|
||||
|
||||
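### Quick table verification after generation:

Note: `table_rows` is only an estimate for InnoDB tables (the MySQL default engine), so the numbers may differ from the exact counts; run `SELECT COUNT(*)` on a table when you need an exact figure.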
|
||||
```bash
|
||||
# Docker
|
||||
docker exec hotel-mysql mysql -uroot -photel2025root hotel_reservations \
|
||||
-e "SELECT table_name, table_rows FROM information_schema.tables WHERE table_schema='hotel_reservations';"
|
||||
|
||||
# Podman
|
||||
podman exec hotel-mysql mysql -uroot -photel2025root hotel_reservations \
|
||||
-e "SELECT table_name, table_rows FROM information_schema.tables WHERE table_schema='hotel_reservations';"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 3 — Prepare Oracle Data Mart
|
||||
|
||||
Connect to the Oracle schema (university lab) and execute `sql/datamart_schema.sql`.
|
||||
|
||||
The script creates:
|
||||
- `ETL_WATERMARK` (with initial row for `FACT_ROOM_BOOKING`)
|
||||
- `STG_HOTEL` (staging)
|
||||
- All 7 dimension tables
|
||||
- `FACT_ROOM_BOOKING`
|
||||
|
||||
```sql
|
||||
-- Run in SQL*Plus or SQL Developer:
|
||||
@datamart_schema.sql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 4 — Configure NiFi
|
||||
|
||||
### 4.1 Add JDBC drivers to NiFi
|
||||
|
||||
Copy the following JARs to `$NIFI_HOME/lib/` (or the NiFi extensions directory):
|
||||
- `mysql-connector-j-8.x.jar`
|
||||
- `ojdbc11.jar`
|
||||
|
||||
Restart NiFi after adding drivers.
|
||||
|
||||
### 4.2 Create Controller Services
|
||||
|
||||
In NiFi UI → Controller Settings → Controller Services:
|
||||
|
||||
**MySQL connection:**
|
||||
- Type: `DBCPConnectionPool`
|
||||
- Database Driver Class Name: `com.mysql.cj.jdbc.Driver`
|
||||
- Database Connection URL: `jdbc:mysql://127.0.0.1:13306/hotel_reservations`
|
||||
- Database User: `root`
|
||||
- Password: `hotel2025root`
|
||||
|
||||
**Oracle connection:**
|
||||
- Type: `DBCPConnectionPool`
|
||||
- Database Driver Class Name: `oracle.jdbc.OracleDriver`
|
||||
- Database Connection URL: `jdbc:oracle:thin:@<host>:1521:<sid>`
|
||||
- Database User: `<your_schema>`
|
||||
- Password: `<your_password>`
|
||||
|
||||
Enable both services.
|
||||
|
||||
### 4.3 Build Process Groups
|
||||
|
||||
Follow the detailed processor configuration in `docs/nifi-flow.md`.
|
||||
|
||||
**Recommended build order:**
|
||||
1. PG-1: Date Dimension (simplest, test first)
|
||||
2. PG-2: Static Dimensions (verify MERGE logic)
|
||||
3. PG-3: DIM_HOTEL SCD2 (most complex — check staging table after run)
|
||||
4. PG-4: DIM_GUEST SCD1
|
||||
5. PG-5: Fact Incremental Load
|
||||
|
||||
---
|
||||
|
||||
## Step 5 — Run ETL
|
||||
|
||||
### First full load
|
||||
|
||||
1. Run **PG-1** (Date Dimension) manually — run once
|
||||
2. Start **PG-2, PG-3, PG-4** — these are idempotent, safe to re-run
|
||||
3. Start **PG-5** — runs incrementally; first run loads all 531k room_bookings
|
||||
|
||||
### Verify load
|
||||
|
||||
```sql
|
||||
-- Oracle
|
||||
SELECT COUNT(*) FROM DIM_HOTEL; -- should be 200 (+ more after SCD2 changes)
|
||||
SELECT COUNT(*) FROM DIM_GUEST; -- 100,000
|
||||
SELECT COUNT(*) FROM FACT_ROOM_BOOKING; -- 531,382
|
||||
SELECT last_key FROM ETL_WATERMARK WHERE entity_name = 'FACT_ROOM_BOOKING'; -- 531,382
|
||||
```
|
||||
|
||||
### Verify SCD2 is working
|
||||
|
||||
```sql
|
||||
-- Should show 1 current version per hotel on initial load
|
||||
SELECT is_current, COUNT(*) FROM DIM_HOTEL GROUP BY is_current;
|
||||
-- Expected: IS_CURRENT=1, COUNT=200
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Stop / Restart
|
||||
|
||||
**Stop MySQL (preserves data):**
|
||||
```bash
|
||||
bash docker/stop.sh [--podman]
|
||||
```
|
||||
|
||||
**Restart MySQL:**
|
||||
```bash
|
||||
bash docker/start.sh [--podman]
|
||||
```
|
||||
|
||||
**Full reset (delete all data) — Podman shown; with Docker, omit `--podman` and use `docker volume rm hotel-mysql-data`:**
|
||||
```bash
|
||||
bash docker/stop.sh --podman
|
||||
podman volume rm hotel-mysql-data
|
||||
bash docker/start.sh --podman
|
||||
dotnet run generator/generate.cs
|
||||
```
|
||||
83
docs/05-conclusion.md
Normal file
83
docs/05-conclusion.md
Normal file
@@ -0,0 +1,83 @@
|
||||
# Conclusion
|
||||
|
||||
## What Was Built
|
||||
|
||||
This project delivers a complete, working **Data Warehouse pipeline** for the Hotel Reservations domain:
|
||||
|
||||
| Layer | What was built | Scale |
|
||||
|-------|---------------|-------|
|
||||
| OLTP | MySQL 8.4, 13-table normalized schema | ~635,000 rows |
|
||||
| Data generation | .NET 10 C# script, realistic seasonal distribution | 500K bookings in ~3 min |
|
||||
| ETL | Apache NiFi, 5 process groups | full + incremental loads |
|
||||
| Data Mart | Oracle star schema, SCD Type 2 on DIM_HOTEL | 1 fact + 7 dims |
|
||||
|
||||
---
|
||||
|
||||
## Design Decisions
|
||||
|
||||
### Synthetic data generation instead of a Kaggle dataset
|
||||
|
||||
The decision to generate data rather than use a pre-existing dataset was deliberate. Publicly available hotel datasets are either too small (thousands of rows) or lack the normalized relational structure needed to demonstrate a realistic OLTP-to-DW pipeline. The generator produces statistically realistic data:
|
||||
|
||||
- Seasonal booking distribution (summer peak, winter trough)
|
||||
- Realistic stay-length distribution (30% one-night stays)
|
||||
- Varied status distribution (80% completed, 10% confirmed, 7% cancelled, 3% no-show)
|
||||
- Revenue rates tied to actual seasonal pricing periods
|
||||
|
||||
### SCD Type 2 on DIM_HOTEL only
|
||||
|
||||
SCD Type 2 adds operational complexity — it requires staging tables, a two-phase SQL update, and SCD2-aware fact inserts. Applying it to every dimension would make the ETL unnecessarily complex for the analytical benefit gained.
|
||||
|
||||
DIM_HOTEL is the right candidate because:
|
||||
- Star rating changes (3★→4★ after renovation) directly affect revenue benchmarks
|
||||
- Chain affiliation changes (hotel joins or leaves a franchise) affect chain-level reporting
|
||||
- Tracking these historically is the core value proposition of dimensional modelling
|
||||
|
||||
Guests, countries, room types, and hotel chains all change rarely or in ways that don't affect historical analysis — SCD Type 1 (overwrite) is appropriate.
|
||||
|
||||
### Watermark-based incremental fact loading
|
||||
|
||||
The fact table uses `source_rb_id` (the MySQL `room_booking_id`) as a natural key and applies a `NOT EXISTS` guard on every insert. Combined with the `ETL_WATERMARK` table, this makes PG-5 both **incremental** (only processes new rows) and **idempotent** (safe to re-run without creating duplicates). This pattern is production-standard and would scale cleanly to a real operational system.
|
||||
|
||||
### Integer date keys in DIM_DATE
|
||||
|
||||
`date_key` is stored as `NUMBER(8)` in YYYYMMDD format rather than a FK to a DATE column. This allows:
|
||||
- Fast range predicates: `WHERE checkin_date_key BETWEEN 20240601 AND 20240831`
|
||||
- No JOIN to get the date value when it's used directly in GROUP BY
|
||||
- Human-readable values in query results without formatting
|
||||
|
||||
---
|
||||
|
||||
## Analytical Capabilities
|
||||
|
||||
The data mart enables the following categories of OLAP queries:
|
||||
|
||||
**Revenue analysis:**
|
||||
- Total revenue by country, city, hotel chain, star category
|
||||
- Revenue trend over time (monthly, quarterly, yearly)
|
||||
- Revenue split by booking status and room type
|
||||
|
||||
**Occupancy analysis:**
|
||||
- Room-nights sold per hotel, per season
|
||||
- Average stay duration by guest country
|
||||
- Cancellation rates by period and hotel category
|
||||
|
||||
**SCD2-specific analysis:**
|
||||
- Compare revenue performance of hotels before and after star rating upgrade
|
||||
- Identify which hotel version (chain affiliation) was more profitable
|
||||
|
||||
**Guest origin analysis:**
|
||||
- Which countries generate the most bookings and revenue
|
||||
- Cross-country booking patterns (guest country vs hotel country)
|
||||
|
||||
---
|
||||
|
||||
## Limitations and Possible Extensions
|
||||
|
||||
| Limitation | Possible extension |
|
||||
|------------|-------------------|
|
||||
| Static OLTP data (no live updates) | Add a NiFi timer to simulate ongoing bookings |
|
||||
| No SCD2 on DIM_ROOM | Add room type tracking for renovation analysis |
|
||||
| Single fact table | Add a second fact table for daily hotel occupancy (snapshot fact) |
|
||||
| No data quality checks in NiFi | Add RouteOnAttribute + dead-letter queue for failed records |
|
||||
| Oracle target is university lab | Package with Oracle XE Docker container for self-contained demo |
|
||||
451
docs/nifi-flow.md
Normal file
451
docs/nifi-flow.md
Normal file
@@ -0,0 +1,451 @@
|
||||
# NiFi ETL Flow — Hotel Reservations Data Mart
|
||||
|
||||
## Overview
|
||||
|
||||
The flow moves data from **MySQL 8.4 OLTP** (source) into **Oracle Data Mart** (target).
|
||||
It is organized into **5 Process Groups** that run in sequence, controlled by a top-level scheduler.
|
||||
|
||||
```
|
||||
[PG-1: Date Dim] → [PG-2: Static Dims] → [PG-3: SCD2 Hotel Dim] → [PG-4: SCD1 Guest] → [PG-5: Fact (incremental)]
|
||||
```
|
||||
|
||||
Each PG has a single **Input Port** and **Output Port** so the orchestrator can chain them with connections.
|
||||
|
||||
---
|
||||
|
||||
## Controller Services (shared by all PGs)
|
||||
|
||||
| Name | Type | Config |
|
||||
|------|------|--------|
|
||||
| `MySQL_DBCPService` | DBCPConnectionPool | Driver: `com.mysql.cj.jdbc.Driver`; URL: `jdbc:mysql://127.0.0.1:13306/hotel_reservations`; User: `root`; Pwd: `hotel2025root` |
|
||||
| `Oracle_DBCPService` | DBCPConnectionPool | Driver: `oracle.jdbc.OracleDriver`; URL: `jdbc:oracle:thin:@<host>:1521:<sid>`; User: `<schema>`; Pwd: `<pwd>` |
|
||||
| `JsonReader` | JsonTreeReader | default settings |
|
||||
| `JsonWriter` | JsonRecordSetWriter | default settings |
|
||||
| `AvroReader` | AvroReader | default settings |
|
||||
|
||||
---
|
||||
|
||||
## PG-1: Load Date Dimension
|
||||
|
||||
**Runs once** (or when extending the date range). Populates `DIM_DATE` for 2020–2030.
|
||||
|
||||
```
|
||||
GenerateFlowFile → ExecuteScript → SplitJson → EvaluateJsonPath → PutSQL
|
||||
```
|
||||
|
||||
### Processors
|
||||
|
||||
**GenerateFlowFile**
|
||||
- Run Schedule: manual (run once via right-click → Run Once)
|
||||
- Custom Text: `{}`
|
||||
|
||||
**ExecuteScript** (Groovy)
|
||||
```groovy
|
||||
import groovy.json.JsonOutput
import java.nio.charset.StandardCharsets
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.time.format.TextStyle
import java.time.temporal.WeekFields

// Builds one JSON array with a DIM_DATE row for every calendar day from
// 2020-01-01 through 2030-12-31 and writes it into the triggering FlowFile.

// Consume the trigger FlowFile. If it were left in the incoming queue, NiFi
// would keep re-running this script and emit the whole calendar repeatedly.
def ff = session.get()
if (ff == null) return

def keyFormat = DateTimeFormatter.ofPattern('yyyyMMdd')
def rows = []
def d = LocalDate.of(2020, 1, 1)
def end = LocalDate.of(2030, 12, 31)
while (!d.isAfter(end)) {
    def m = d.monthValue
    // Season by month: Jun-Aug Peak, Mar-May High, Sep-Nov Autumn, Dec-Feb Winter.
    def season = (m >= 6 && m <= 8) ? 'Peak' :
                 (m >= 3 && m <= 5) ? 'High' :
                 (m >= 9 && m <= 11) ? 'Autumn' :
                 'Winter'
    rows << [
        date_key: d.format(keyFormat) as int,
        full_date: d.toString(),
        year: d.year,
        quarter: (m - 1).intdiv(3) + 1,
        month: m,
        // Month/DayOfWeek toString() is upper-case ("JANUARY"); capitalize()
        // would leave it unchanged, so use the display name ("January").
        month_name: d.month.getDisplayName(TextStyle.FULL, Locale.ENGLISH),
        // NOTE(review): WeekFields.ISO.weekOfYear() can report week 0 for the
        // first days of January; switch to weekOfWeekBasedYear() if strict
        // ISO-8601 week numbers are wanted.
        week_number: d.get(WeekFields.ISO.weekOfYear()),
        day_of_month: d.dayOfMonth,
        day_name: d.dayOfWeek.getDisplayName(TextStyle.FULL, Locale.ENGLISH),
        is_weekend: (d.dayOfWeek.value >= 6) ? 1 : 0,
        is_business_day: (d.dayOfWeek.value <= 5) ? 1 : 0,
        season: season
    ]
    d = d.plusDays(1)
}

// Write with an explicit charset so the output does not depend on the JVM default.
ff = session.write(ff, { out ->
    out.write(JsonOutput.toJson(rows).getBytes(StandardCharsets.UTF_8))
} as OutputStreamCallback)
ff = session.putAttribute(ff, 'mime.type', 'application/json')
session.transfer(ff, REL_SUCCESS)
|
||||
```
|
||||
|
||||
**SplitJson**
|
||||
- JsonPath Expression: `$.*`
|
||||
|
||||
**EvaluateJsonPath**
|
||||
- Destination: `flowfile-attribute`
|
||||
- Attributes: `date_key`, `full_date`, `year`, `quarter`, `month`, `month_name`, `week_number`, `day_of_month`, `day_name`, `is_weekend`, `is_business_day`, `season`
|
||||
|
||||
**PutSQL**
|
||||
- JDBC Connection Pool: `Oracle_DBCPService`
|
||||
- SQL Statement:
|
||||
```sql
|
||||
INSERT INTO DIM_DATE (date_key, full_date, year, quarter, month, month_name,
|
||||
week_number, day_of_month, day_name, is_weekend, is_business_day, season)
|
||||
VALUES (${date_key}, TO_DATE('${full_date}','YYYY-MM-DD'), ${year}, ${quarter},
|
||||
${month}, '${month_name}', ${week_number}, ${day_of_month},
|
||||
'${day_name}', ${is_weekend}, ${is_business_day}, '${season}')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## PG-2: Static Dimensions (SCD Type 1)
|
||||
|
||||
Loads `DIM_COUNTRY`, `DIM_STAR_RATING`, `DIM_HOTEL_CHAIN` from MySQL.
|
||||
Uses **MERGE INTO** so the flow is idempotent — re-running it updates changed rows and inserts new ones.
|
||||
|
||||
Each sub-flow follows the same pattern:
|
||||
|
||||
```
|
||||
ExecuteSQL(MySQL) → ConvertAvroToJSON → SplitJson → EvaluateJsonPath → PutSQL(MERGE)
|
||||
```
|
||||
|
||||
### 2a — DIM_COUNTRY
|
||||
|
||||
**ExecuteSQL** — Connection Pool: `MySQL_DBCPService`
|
||||
```sql
|
||||
SELECT country_id, code, name, currency FROM country ORDER BY country_id
|
||||
```
|
||||
|
||||
**EvaluateJsonPath** attributes: `country_id`, `code`, `name`, `currency`
|
||||
|
||||
**PutSQL**
|
||||
```sql
|
||||
MERGE INTO DIM_COUNTRY tgt
USING (SELECT 1 FROM DUAL) src ON (tgt.country_id = ${country_id})
-- Text attributes are spliced into string literals, so single quotes are
-- doubled via Expression Language to keep the statement valid.
WHEN MATCHED THEN
    UPDATE SET
        tgt.code = '${code:replace("'", "''")}',
        tgt.name = '${name:replace("'", "''")}',
        tgt.currency = '${currency:replace("'", "''")}'
WHEN NOT MATCHED THEN
    INSERT (country_id, code, name, currency)
    VALUES (
        ${country_id},
        '${code:replace("'", "''")}',
        '${name:replace("'", "''")}',
        '${currency:replace("'", "''")}'
    )
|
||||
```
|
||||
|
||||
### 2b — DIM_STAR_RATING
|
||||
|
||||
**ExecuteSQL**
|
||||
```sql
|
||||
SELECT star_rating_id, code, description FROM star_rating ORDER BY code
|
||||
```
|
||||
|
||||
**PutSQL**
|
||||
```sql
|
||||
MERGE INTO DIM_STAR_RATING tgt
USING (SELECT 1 FROM DUAL) src ON (tgt.star_rating_id = ${star_rating_id})
-- The description is spliced into a string literal, so single quotes are
-- doubled via Expression Language to keep the statement valid.
WHEN MATCHED THEN
    UPDATE SET
        tgt.code = ${code},
        tgt.description = '${description:replace("'", "''")}'
WHEN NOT MATCHED THEN
    INSERT (star_rating_id, code, description)
    VALUES (${star_rating_id}, ${code}, '${description:replace("'", "''")}')
|
||||
```
|
||||
|
||||
### 2c — DIM_HOTEL_CHAIN
|
||||
|
||||
**ExecuteSQL**
|
||||
```sql
|
||||
SELECT hotel_chain_id, code, name FROM hotel_chain ORDER BY hotel_chain_id
|
||||
```
|
||||
|
||||
**PutSQL**
|
||||
```sql
|
||||
MERGE INTO DIM_HOTEL_CHAIN tgt
USING (SELECT 1 FROM DUAL) src ON (tgt.hotel_chain_id = ${hotel_chain_id})
-- Text attributes are spliced into string literals, so single quotes are
-- doubled via Expression Language to keep the statement valid.
WHEN MATCHED THEN
    UPDATE SET
        tgt.code = '${code:replace("'", "''")}',
        tgt.name = '${name:replace("'", "''")}'
WHEN NOT MATCHED THEN
    INSERT (hotel_chain_id, code, name)
    VALUES (
        ${hotel_chain_id},
        '${code:replace("'", "''")}',
        '${name:replace("'", "''")}'
    )
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## PG-3: DIM_HOTEL — SCD Type 2
|
||||
|
||||
This is the analytically significant dimension. Hotels change star rating and chain affiliation over time (renovations, rebrandings). SCD Type 2 preserves history so reports can accurately show revenue by star category **at the time of booking**, not just today's category.
|
||||
|
||||
**Architecture:** NiFi stages raw data into `STG_HOTEL`, then an `ExecuteScript` runs the SCD2 SQL logic in a single Oracle transaction.
|
||||
|
||||
```
|
||||
[Truncate STG] → [Load STG from MySQL] → [Apply SCD2 SQL]
|
||||
```
|
||||
|
||||
### Step A: Truncate staging
|
||||
|
||||
**GenerateFlowFile** → **PutSQL**
|
||||
```sql
|
||||
TRUNCATE TABLE STG_HOTEL
|
||||
```
|
||||
|
||||
### Step B: Load staging from MySQL
|
||||
|
||||
**ExecuteSQL** — `MySQL_DBCPService`
|
||||
```sql
|
||||
SELECT
|
||||
h.hotel_id,
|
||||
hc.code AS chain_code,
|
||||
c.code AS country_code,
|
||||
sr.code AS star_code,
|
||||
h.code,
|
||||
h.name,
|
||||
h.city
|
||||
FROM hotel h
|
||||
JOIN country c ON c.country_id = h.country_id
|
||||
JOIN star_rating sr ON sr.star_rating_id = h.star_rating_id
|
||||
LEFT JOIN hotel_chain hc ON hc.hotel_chain_id = h.hotel_chain_id
|
||||
ORDER BY h.hotel_id
|
||||
```
|
||||
|
||||
**ConvertAvroToJSON** → **SplitJson** (`$.*`)
|
||||
|
||||
**EvaluateJsonPath** attributes: `hotel_id`, `chain_code`, `country_code`, `star_code`, `code`, `name`, `city`
|
||||
|
||||
**PutSQL** → `STG_HOTEL`
|
||||
```sql
|
||||
INSERT INTO STG_HOTEL (hotel_id, chain_code, country_code, star_code, code, name, city)
-- Text attributes are spliced into string literals, so single quotes are
-- doubled via Expression Language (a hotel name or city may contain an apostrophe).
VALUES (
    ${hotel_id},
    NULLIF('${chain_code:replace("'", "''")}',''),
    '${country_code:replace("'", "''")}',
    ${star_code},
    '${code:replace("'", "''")}',
    '${name:replace("'", "''")}',
    '${city:replace("'", "''")}'
)
|
||||
```
|
||||
|
||||
### Step C: Apply SCD2 logic
|
||||
|
||||
**GenerateFlowFile** (runs after B finishes) → **ExecuteScript** (Groovy)
|
||||
|
||||
The Groovy script opens a JDBC connection and executes two SQL statements in one transaction:
|
||||
|
||||
```groovy
|
||||
import java.sql.*

// Applies SCD Type 2 to DIM_HOTEL from the rows staged in STG_HOTEL.
// Both statements run in one Oracle transaction: either the changed versions
// are expired AND their replacements inserted, or nothing changes.

// Consume the trigger FlowFile. If it were left in the incoming queue, NiFi
// would keep re-running this script. When the script throws, the session is
// rolled back and the trigger stays queued for a retry.
def trigger = session.get()
if (trigger == null) return

def conn = context.controllerServiceLookup
        .getControllerService('Oracle_DBCPService_ID')
        .getConnection()
conn.autoCommit = false

// Runs one DML statement and always closes its PreparedStatement.
def runUpdate = { String sql ->
    def stmt = conn.prepareStatement(sql)
    try {
        stmt.executeUpdate()
    } finally {
        stmt.close()
    }
}

try {
    // 1. Expire current versions whose tracked attributes (chain, star rating,
    //    city) differ from staging. The expiry date is the last day the old
    //    version was valid, i.e. yesterday.
    runUpdate("""
        UPDATE DIM_HOTEL dh
        SET dh.expiry_date = TRUNC(SYSDATE) - 1,
            dh.is_current = 0
        WHERE dh.is_current = 1
          AND EXISTS (
              SELECT 1 FROM STG_HOTEL s
              WHERE s.hotel_id = dh.source_hotel_id
                AND (
                    NVL(s.chain_code,'~') != NVL((
                        SELECT hc.code FROM DIM_HOTEL_CHAIN hc
                        WHERE hc.hotel_chain_key = dh.hotel_chain_key),'~')
                    OR s.star_code != (
                        SELECT ds.code FROM DIM_STAR_RATING ds
                        WHERE ds.star_rating_key = dh.star_rating_key)
                    OR s.city != dh.city
                )
          )
    """)

    // 2. Insert a new current version for every staged hotel that has none:
    //    hotels expired in step 1 and brand-new hotels.
    //    A hotel's first-ever version is back-dated to 1900-01-01 so that
    //    historical bookings (check-in before the load date) still find a
    //    version in the fact load's effective/expiry date lookup. Later
    //    versions start today.
    runUpdate("""
        INSERT INTO DIM_HOTEL (
            source_hotel_id, hotel_chain_key, country_key, star_rating_key,
            code, name, city, effective_date, expiry_date, is_current)
        SELECT
            s.hotel_id,
            (SELECT hc.hotel_chain_key FROM DIM_HOTEL_CHAIN hc WHERE hc.code = s.chain_code),
            (SELECT dc.country_key FROM DIM_COUNTRY dc WHERE dc.code = s.country_code),
            (SELECT ds.star_rating_key FROM DIM_STAR_RATING ds WHERE ds.code = s.star_code),
            s.code, s.name, s.city,
            CASE
                WHEN EXISTS (
                    SELECT 1 FROM DIM_HOTEL prev
                    WHERE prev.source_hotel_id = s.hotel_id)
                THEN TRUNC(SYSDATE)
                ELSE DATE '1900-01-01'
            END,
            NULL, 1
        FROM STG_HOTEL s
        WHERE NOT EXISTS (
            SELECT 1 FROM DIM_HOTEL d
            WHERE d.source_hotel_id = s.hotel_id
              AND d.is_current = 1
        )
    """)

    conn.commit()
} catch (Exception e) {
    conn.rollback()
    throw e
} finally {
    conn.close()
}

session.transfer(trigger, REL_SUCCESS)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## PG-4: DIM_GUEST — SCD Type 1
|
||||
|
||||
Guest personal data (city, country) can change without any analytical value in tracking the history. Plain MERGE/upsert is correct here.
|
||||
|
||||
```
|
||||
ExecuteSQL(MySQL) → ConvertAvroToJSON → SplitJson → EvaluateJsonPath → PutSQL(MERGE)
|
||||
```
|
||||
|
||||
**ExecuteSQL** — `MySQL_DBCPService`
|
||||
```sql
|
||||
SELECT g.guest_id, c.code AS country_code, g.name, g.city
|
||||
FROM guest g
|
||||
LEFT JOIN country c ON c.country_id = g.country_id
|
||||
ORDER BY g.guest_id
|
||||
```
|
||||
|
||||
**EvaluateJsonPath** attributes: `guest_id`, `country_code`, `name`, `city`
|
||||
|
||||
**PutSQL**
|
||||
```sql
|
||||
MERGE INTO DIM_GUEST tgt
USING (SELECT 1 FROM DUAL) src ON (tgt.guest_id = ${guest_id})
-- Text attributes are spliced into string literals, so single quotes are
-- doubled via Expression Language (guest names such as O'Brien would
-- otherwise break the statement).
WHEN MATCHED THEN
    UPDATE SET
        tgt.country_key = (
            SELECT country_key FROM DIM_COUNTRY
            WHERE code = NULLIF('${country_code:replace("'", "''")}','')),
        tgt.name = '${name:replace("'", "''")}',
        tgt.city = NULLIF('${city:replace("'", "''")}','')
WHEN NOT MATCHED THEN
    INSERT (guest_id, country_key, name, city)
    VALUES (
        ${guest_id},
        (SELECT country_key FROM DIM_COUNTRY
         WHERE code = NULLIF('${country_code:replace("'", "''")}','')),
        '${name:replace("'", "''")}',
        NULLIF('${city:replace("'", "''")}','')
    )
|
||||
```
|
||||
|
||||
> **Note:** DIM_ROOM is also SCD Type 1 — load it the same way as DIM_GUEST, joining `hotel_room` with `room_type` in MySQL and MERGEing into `DIM_ROOM` (surrogate key lookup via `source_hotel_id + IS_CURRENT=1` from DIM_HOTEL).
|
||||
|
||||
---
|
||||
|
||||
## PG-5: FACT_ROOM_BOOKING — Incremental Load (Watermark)
|
||||
|
||||
The fact table is loaded **incrementally**: only `room_booking` rows with `room_booking_id` greater than the last loaded value are processed. The watermark is stored in `ETL_WATERMARK` in Oracle.
|
||||
|
||||
Re-running is safe: the insert is guarded by `NOT EXISTS` on `source_rb_id`, so already-loaded rows are skipped, and the UNIQUE constraint on `FACT_ROOM_BOOKING.source_rb_id` acts as a backstop against duplicates.
|
||||
|
||||
```
|
||||
[Read Watermark] → [ExecuteSQL MySQL] → [ConvertAvroToJSON] → [SplitJson]
|
||||
→ [EvaluateJsonPath] → [PutSQL FACT] → [Update Watermark]
|
||||
```
|
||||
|
||||
### Step A: Read watermark
|
||||
|
||||
**ExecuteSQL** — `Oracle_DBCPService`
|
||||
```sql
|
||||
SELECT last_key FROM ETL_WATERMARK WHERE entity_name = 'FACT_ROOM_BOOKING'
|
||||
```
|
||||
|
||||
**ConvertAvroToJSON** → **EvaluateJsonPath**
|
||||
- `watermark` ← `$.last_key`
|
||||
|
||||
### Step B: Load from MySQL
|
||||
|
||||
**ExecuteSQL** — `MySQL_DBCPService`
|
||||
SQL Statement (use attribute `${watermark}`):
|
||||
```sql
|
||||
SELECT
|
||||
rb.room_booking_id,
|
||||
rb.room_id,
|
||||
rb.date_from,
|
||||
rb.date_to,
|
||||
rb.nightly_rate,
|
||||
rb.total_amount,
|
||||
b.guest_id,
|
||||
b.status AS booking_status,
|
||||
DATEDIFF(rb.date_to, rb.date_from) AS nights_stayed
|
||||
FROM room_booking rb
|
||||
JOIN booking b ON b.booking_id = rb.booking_id
|
||||
WHERE rb.room_booking_id > ${watermark}
|
||||
ORDER BY rb.room_booking_id
|
||||
LIMIT 50000
|
||||
```
|
||||
|
||||
> Set LIMIT to control batch size. Run PG-5 in a loop (using a Timer-driven GenerateFlowFile) until no rows come back.
|
||||
|
||||
### Step C: Split + extract attributes
|
||||
|
||||
**ConvertAvroToJSON** → **SplitJson** (`$.*`)
|
||||
|
||||
**EvaluateJsonPath** attributes:
|
||||
`room_booking_id`, `room_id`, `guest_id`, `date_from`, `date_to`, `nightly_rate`, `total_amount`, `booking_status`, `nights_stayed`
|
||||
|
||||
### Step D: Insert into fact table
|
||||
|
||||
**PutSQL** — `Oracle_DBCPService`
|
||||
```sql
|
||||
INSERT INTO FACT_ROOM_BOOKING (
    source_rb_id, hotel_key, hotel_chain_key, room_key, guest_key, country_key,
    star_rating_key, checkin_date_key, checkout_date_key,
    booking_status, nights_stayed, nightly_rate, total_amount)
SELECT
    ${room_booking_id},
    dh.hotel_key,
    dh.hotel_chain_key,
    dr.room_key,
    dg.guest_key,
    dg.country_key,
    dh.star_rating_key,
    TO_NUMBER(TO_CHAR(TO_DATE('${date_from}','YYYY-MM-DD'), 'YYYYMMDD')),
    TO_NUMBER(TO_CHAR(TO_DATE('${date_to}', 'YYYY-MM-DD'), 'YYYYMMDD')),
    '${booking_status}',
    ${nights_stayed},
    TO_NUMBER('${nightly_rate}', '9999990D99', 'NLS_NUMERIC_CHARACTERS=''.,'''),
    TO_NUMBER('${total_amount}', '9999990D99', 'NLS_NUMERIC_CHARACTERS=''.,''')
FROM DIM_ROOM dr
-- DIM_ROOM.hotel_key points at a single DIM_HOTEL version. Resolve it to the
-- source hotel first so the SCD2 lookup below can pick ANY version of it.
INNER JOIN DIM_HOTEL room_hotel
    ON room_hotel.hotel_key = dr.hotel_key
-- SCD2 lookup: the hotel version active at the check-in date. expiry_date is
-- the last valid day of a version (versions are expired with
-- TRUNC(SYSDATE) - 1), so the upper bound is inclusive; an exclusive bound
-- would leave check-ins on the expiry day without any matching version.
INNER JOIN DIM_HOTEL dh
    ON dh.source_hotel_id = room_hotel.source_hotel_id
    AND dh.effective_date <= TO_DATE('${date_from}','YYYY-MM-DD')
    AND (dh.expiry_date IS NULL OR dh.expiry_date >= TO_DATE('${date_from}','YYYY-MM-DD'))
INNER JOIN DIM_GUEST dg
    ON dg.guest_id = ${guest_id}
WHERE dr.room_id = ${room_id}
    -- Idempotent: skip if already loaded
    AND NOT EXISTS (
        SELECT 1 FROM FACT_ROOM_BOOKING f WHERE f.source_rb_id = ${room_booking_id}
    )
|
||||
```
|
||||
|
||||
> The `DH.EFFECTIVE_DATE / EXPIRY_DATE` condition is the payoff of SCD Type 2: the fact row always references the hotel dimension version that was true **when the guest actually checked in**, not what the hotel looks like today.
|
||||
|
||||
**Ignore Errors** on PutSQL (route `failure` → funnel) — UNIQUE constraint violations on `source_rb_id` are expected and harmless on re-runs.
|
||||
|
||||
### Step E: Update watermark
|
||||
|
||||
After the PutSQL succeeds, update the watermark with the highest `room_booking_id` seen in this batch.
|
||||
|
||||
**UpdateAttribute**
|
||||
- `max_rb_id` ← `${room_booking_id}` (Expression Language evaluates one FlowFile at a time, so taking the maximum across the whole batch requires a custom processor or a MergeContent step)
|
||||
|
||||
> Simplest approach: add a final **ExecuteSQL** that runs after the batch:
|
||||
```sql
|
||||
UPDATE ETL_WATERMARK
SET last_key = COALESCE(
        -- MAX() over an empty fact table is NULL; keep the current watermark
        -- in that case so the next extract's "> watermark" filter stays valid.
        (SELECT MAX(source_rb_id) FROM FACT_ROOM_BOOKING),
        last_key),
    last_run_ts = SYSTIMESTAMP
WHERE entity_name = 'FACT_ROOM_BOOKING'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Execution Order & Scheduling
|
||||
|
||||
| PG | Trigger | Frequency |
|
||||
|----|---------|-----------|
|
||||
| PG-1 (Date Dim) | Manual (run once) | — |
|
||||
| PG-2 (Static Dims) | Timer — 24h | Daily |
|
||||
| PG-3 (DIM_HOTEL SCD2) | Timer — 24h | Daily, after PG-2 |
|
||||
| PG-4 (DIM_GUEST SCD1) | Timer — 24h | Daily, after PG-3 |
|
||||
| PG-5 (Fact incremental) | Timer — 1h | Hourly |
|
||||
|
||||
Chain PG-2 → PG-3 → PG-4 by connecting each PG's Output Port to the next PG's Input Port via a **success** relationship.
|
||||
|
||||
---
|
||||
|
||||
## Why SCD Type 2 for DIM_HOTEL?
|
||||
|
||||
A hotel being upgraded from 3★ to 4★ changes its rate tier going forward. If we just overwrite the dimension (SCD1), all historical bookings would suddenly appear to have been made in a 4★ hotel — inflating average revenue per star category in reports. SCD2 preserves the correct picture: every fact row points to the exact hotel version that was true at check-in.
|
||||
540
generator/generate.cs
Normal file
540
generator/generate.cs
Normal file
@@ -0,0 +1,540 @@
|
||||
#:package MySqlConnector@2.3.7
|
||||
|
||||
using System.Text;
|
||||
using System.Globalization;
|
||||
using MySqlConnector;
|
||||
|
||||
// ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
||||
const string DSN = "Server=127.0.0.1;Port=13306;Database=hotel_reservations;Uid=root;Pwd=hotel2025root;AllowLoadLocalInfile=true;";
|
||||
const int HOTEL_COUNT = 200;
|
||||
const int GUEST_COUNT = 100_000;
|
||||
const int BOOKING_COUNT = 500_000;
|
||||
const int BATCH = 500;
|
||||
const int SEED = 42;
|
||||
|
||||
var rng = new Random(SEED);
|
||||
|
||||
// ── DB helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
await using var conn = new MySqlConnection(DSN);
|
||||
await conn.OpenAsync();
|
||||
Console.WriteLine("Connected.");
|
||||
await new MySqlCommand("SET foreign_key_checks=0, unique_checks=0", conn).ExecuteNonQueryAsync();
|
||||
|
||||
async Task Exec(string sql)
|
||||
{
|
||||
await using var cmd = new MySqlCommand(sql, conn);
|
||||
cmd.CommandTimeout = 300;
|
||||
await cmd.ExecuteNonQueryAsync();
|
||||
}
|
||||
|
||||
// Runs a query and returns its first column of the first row as a long.
// Returns 0 when the query yields no row or a SQL NULL (e.g. MAX() over an
// empty table) instead of throwing on DBNull.
async Task<long> ExecScalar(string sql)
{
    await using var cmd = new MySqlCommand(sql, conn);
    cmd.CommandTimeout = 300; // same generous timeout as Exec
    var result = await cmd.ExecuteScalarAsync();
    return result is null or DBNull ? 0L : Convert.ToInt64(result);
}
|
||||
|
||||
// Bulk insert: builds a single INSERT ... VALUES (...),(...),...
|
||||
// Inserts pre-rendered value tuples into `table`, issuing one multi-row
// INSERT ... VALUES (...),(...) statement per group of up to BATCH tuples.
// An empty list issues no statement.
async Task BulkInsert(string table, string columns, List<string> valueTuples)
{
    foreach (var chunk in valueTuples.Chunk(BATCH))
    {
        var values = string.Join(',', chunk);
        await Exec($"INSERT INTO {table} ({columns}) VALUES {values}");
    }
}
|
||||
|
||||
// Renders a nullable string as a MySQL string literal, or NULL.
// Backslashes are doubled before quotes are doubled: in MySQL's default SQL
// mode a backslash is an escape character inside string literals, so a value
// ending in '\' would otherwise swallow the closing quote.
string S(string? s) => s == null ? "NULL" : $"'{s.Replace("\\", "\\\\").Replace("'", "''")}'";
|
||||
string N(object? n) => n == null ? "NULL" : (n is IFormattable f) ? f.ToString(null, CultureInfo.InvariantCulture) : n.ToString()!;
|
||||
string D(DateTime d) => $"'{d:yyyy-MM-dd}'";
|
||||
string DT(DateTime d) => $"'{d:yyyy-MM-dd HH:mm:ss}'";
|
||||
|
||||
// ── Reference data ────────────────────────────────────────────────────────────
|
||||
|
||||
// ── 1. hotel_chain ────────────────────────────────────────────────────────────
|
||||
Console.WriteLine("[1/8] hotel_chain");
|
||||
|
||||
var chains = new (string Code, string Name)[]
|
||||
{
|
||||
("HLT", "Hilton Worldwide"),
|
||||
("MRT", "Marriott International"),
|
||||
("HYT", "Hyatt Hotels Corporation"),
|
||||
("IHG", "InterContinental Hotels Group"),
|
||||
("WYN", "Wyndham Hotels & Resorts"),
|
||||
("ACC", "Accor"),
|
||||
("BW", "Best Western Hotels"),
|
||||
("RAD", "Radisson Hotels"),
|
||||
("MEL", "Meliá Hotels International"),
|
||||
("NH", "NH Hotel Group"),
|
||||
};
|
||||
|
||||
await BulkInsert("hotel_chain", "code, name",
|
||||
chains.Select(c => $"({S(c.Code)},{S(c.Name)})").ToList());
|
||||
|
||||
var chainIds = new Dictionary<string, int>();
|
||||
{
|
||||
await using var cmd = new MySqlCommand("SELECT hotel_chain_id, code FROM hotel_chain", conn);
|
||||
await using var r = await cmd.ExecuteReaderAsync();
|
||||
while (await r.ReadAsync()) chainIds[r.GetString(1)] = r.GetInt32(0);
|
||||
}
|
||||
|
||||
// ── 2. country ────────────────────────────────────────────────────────────────
|
||||
Console.WriteLine("[2/8] country");
|
||||
|
||||
var countries = new (string Code, string Name, string Currency)[]
|
||||
{
|
||||
("GB","United Kingdom","GBP"), ("FR","France","EUR"), ("DE","Germany","EUR"),
|
||||
("ES","Spain","EUR"), ("IT","Italy","EUR"), ("PT","Portugal","EUR"),
|
||||
("NL","Netherlands","EUR"), ("BE","Belgium","EUR"), ("AT","Austria","EUR"),
|
||||
("CH","Switzerland","CHF"), ("SE","Sweden","SEK"), ("NO","Norway","NOK"),
|
||||
("DK","Denmark","DKK"), ("PL","Poland","PLN"), ("CZ","Czech Republic","CZK"),
|
||||
("HU","Hungary","HUF"), ("HR","Croatia","EUR"), ("GR","Greece","EUR"),
|
||||
("TR","Turkey","TRY"), ("US","United States","USD"), ("CA","Canada","CAD"),
|
||||
("MX","Mexico","MXN"), ("BR","Brazil","BRL"), ("AR","Argentina","ARS"),
|
||||
("AU","Australia","AUD"), ("NZ","New Zealand","NZD"), ("JP","Japan","JPY"),
|
||||
("CN","China","CNY"), ("KR","South Korea","KRW"), ("SG","Singapore","SGD"),
|
||||
("TH","Thailand","THB"), ("AE","United Arab Emirates","AED"),
|
||||
("SA","Saudi Arabia","SAR"), ("EG","Egypt","EGP"), ("ZA","South Africa","ZAR"),
|
||||
("IN","India","INR"), ("MA","Morocco","MAD"), ("TN","Tunisia","TND"),
|
||||
("ID","Indonesia","IDR"), ("MY","Malaysia","MYR"),
|
||||
};
|
||||
|
||||
await BulkInsert("country", "code, name, currency",
|
||||
countries.Select(c => $"({S(c.Code)},{S(c.Name)},{S(c.Currency)})").ToList());
|
||||
|
||||
var countryIds = new Dictionary<string, int>();
|
||||
{
|
||||
await using var cmd = new MySqlCommand("SELECT country_id, code FROM country", conn);
|
||||
await using var r = await cmd.ExecuteReaderAsync();
|
||||
while (await r.ReadAsync()) countryIds[r.GetString(1)] = r.GetInt32(0);
|
||||
}
|
||||
|
||||
// ── 3. star_rating ────────────────────────────────────────────────────────────
|
||||
Console.WriteLine("[3/8] star_rating");
|
||||
|
||||
await BulkInsert("star_rating", "code, description",
|
||||
Enumerable.Range(1, 5).Select(i => $"({i},{S(i + " Star")})").ToList());
|
||||
|
||||
var starIds = new Dictionary<int, int>();
|
||||
{
|
||||
await using var cmd = new MySqlCommand("SELECT star_rating_id, code FROM star_rating", conn);
|
||||
await using var r = await cmd.ExecuteReaderAsync();
|
||||
while (await r.ReadAsync()) starIds[r.GetInt32(1)] = r.GetInt32(0);
|
||||
}
|
||||
|
||||
// ── 4. hotel_characteristic ───────────────────────────────────────────────────
|
||||
Console.WriteLine("[4/8] hotel_characteristic");
|
||||
|
||||
var characteristics = new (string Code, string Desc)[]
|
||||
{
|
||||
("WIFI", "Free WiFi"), ("POOL", "Swimming Pool"),
|
||||
("GYM", "Fitness Center"), ("SPA", "Spa & Wellness"),
|
||||
("RESTAURANT", "On-site Restaurant"), ("BAR", "Hotel Bar"),
|
||||
("PARKING", "Free Parking"), ("VALET", "Valet Parking"),
|
||||
("CONFERENCE", "Conference Rooms"), ("SHUTTLE", "Airport Shuttle"),
|
||||
("ROOM_SVC", "Room Service"), ("PETS", "Pet Friendly"),
|
||||
};
|
||||
|
||||
await BulkInsert("hotel_characteristic", "code, description",
|
||||
characteristics.Select(c => $"({S(c.Code)},{S(c.Desc)})").ToList());
|
||||
|
||||
var charIds = new Dictionary<string, int>();
|
||||
{
|
||||
await using var cmd = new MySqlCommand("SELECT characteristic_id, code FROM hotel_characteristic", conn);
|
||||
await using var r = await cmd.ExecuteReaderAsync();
|
||||
while (await r.ReadAsync()) charIds[r.GetString(1)] = r.GetInt32(0);
|
||||
}
|
||||
|
||||
// ── 5. room_type + rate_period + period_room_rate ─────────────────────────────
|
||||
Console.WriteLine("[5/8] room_type / rate_period / period_room_rate");
|
||||
|
||||
var roomTypes = new (string Code, string Desc, decimal BaseRate, bool Smoking)[]
|
||||
{
|
||||
("SINGLE", "Single Room", 80m, false),
|
||||
("DOUBLE", "Double Room", 120m, false),
|
||||
("TWIN", "Twin Room", 115m, false),
|
||||
("DELUXE", "Deluxe Double", 180m, false),
|
||||
("SUITE", "Junior Suite", 280m, false),
|
||||
("EXEC", "Executive Suite", 450m, false),
|
||||
("FAMILY", "Family Room", 200m, false),
|
||||
};
|
||||
|
||||
// smoking_yn is taken from each room type's Smoking flag rather than a
// hard-coded 0, so editing the table above actually changes the inserted rows.
await BulkInsert("room_type", "code, description, standard_rate, smoking_yn",
    roomTypes.Select(rt => $"({S(rt.Code)},{S(rt.Desc)},{N(rt.BaseRate)},{(rt.Smoking ? 1 : 0)})").ToList());
|
||||
|
||||
var roomTypeIds = new Dictionary<string, int>();
|
||||
{
|
||||
await using var cmd = new MySqlCommand("SELECT room_type_id, code FROM room_type", conn);
|
||||
await using var r = await cmd.ExecuteReaderAsync();
|
||||
while (await r.ReadAsync()) roomTypeIds[r.GetString(1)] = r.GetInt32(0);
|
||||
}
|
||||
|
||||
// Seasons: month → multiplier
|
||||
var ratePeriods = new (string Code, string Desc, int MonthFrom, int MonthTo, decimal Multiplier)[]
|
||||
{
|
||||
("PEAK", "Peak Season (Jun-Aug)", 6, 8, 1.5m),
|
||||
("HIGH", "High Season (Mar-May)", 3, 5, 1.2m),
|
||||
("AUTUMN", "Autumn Season (Sep-Nov)", 9, 11, 1.1m),
|
||||
("WINTER", "Winter Season (Dec-Feb)", 12, 2, 0.9m),
|
||||
};
|
||||
|
||||
await BulkInsert("rate_period", "code, description, month_from, month_to",
|
||||
ratePeriods.Select(rp => $"({S(rp.Code)},{S(rp.Desc)},{rp.MonthFrom},{rp.MonthTo})").ToList());
|
||||
|
||||
var ratePeriodIds = new Dictionary<string, int>();
|
||||
{
|
||||
await using var cmd = new MySqlCommand("SELECT rate_period_id, code FROM rate_period", conn);
|
||||
await using var r = await cmd.ExecuteReaderAsync();
|
||||
while (await r.ReadAsync()) ratePeriodIds[r.GetString(1)] = r.GetInt32(0);
|
||||
}
|
||||
|
||||
// period_room_rate: rate = base_rate * season_multiplier
|
||||
var prrRows = new List<string>();
|
||||
foreach (var rt in roomTypes)
|
||||
foreach (var rp in ratePeriods)
|
||||
{
|
||||
var rate = Math.Round(rt.BaseRate * rp.Multiplier, 2);
|
||||
prrRows.Add($"({roomTypeIds[rt.Code]},{ratePeriodIds[rp.Code]},{N(rate)})");
|
||||
}
|
||||
await BulkInsert("period_room_rate", "room_type_id, rate_period_id, rate", prrRows);
|
||||
|
||||
// Build month → rate_period_id lookup in memory
|
||||
var monthToRatePeriodId = new Dictionary<int, int>();
|
||||
foreach (var rp in ratePeriods)
|
||||
{
|
||||
if (rp.MonthFrom <= rp.MonthTo)
|
||||
for (int m = rp.MonthFrom; m <= rp.MonthTo; m++)
|
||||
monthToRatePeriodId[m] = ratePeriodIds[rp.Code];
|
||||
else // wraps year (Dec-Feb)
|
||||
{
|
||||
for (int m = rp.MonthFrom; m <= 12; m++) monthToRatePeriodId[m] = ratePeriodIds[rp.Code];
|
||||
for (int m = 1; m <= rp.MonthTo; m++) monthToRatePeriodId[m] = ratePeriodIds[rp.Code];
|
||||
}
|
||||
}
|
||||
|
||||
// Build (room_type_id, rate_period_id) → rate lookup
|
||||
var rateMap = new Dictionary<(int, int), decimal>();
|
||||
foreach (var rt in roomTypes)
|
||||
foreach (var rp in ratePeriods)
|
||||
rateMap[(roomTypeIds[rt.Code], ratePeriodIds[rp.Code])] =
|
||||
Math.Round(rt.BaseRate * rp.Multiplier, 2);
|
||||
|
||||
// ── 6. hotel + hotel_room + hotel_hotel_characteristic ────────────────────────
|
||||
Console.WriteLine("[6/8] hotel / hotel_room / hotel_hotel_characteristic");
|
||||
|
||||
var hotelCities = new (string City, string Country)[]
|
||||
{
|
||||
("London","GB"), ("Manchester","GB"), ("Edinburgh","GB"),
|
||||
("Paris","FR"), ("Lyon","FR"), ("Nice","FR"),
|
||||
("Berlin","DE"), ("Munich","DE"), ("Hamburg","DE"),
|
||||
("Madrid","ES"), ("Barcelona","ES"), ("Seville","ES"),
|
||||
("Rome","IT"), ("Milan","IT"), ("Florence","IT"),
|
||||
("Lisbon","PT"), ("Porto","PT"), ("Amsterdam","NL"),
|
||||
("Vienna","AT"), ("Zurich","CH"), ("Geneva","CH"),
|
||||
("Stockholm","SE"), ("Oslo","NO"), ("Copenhagen","DK"),
|
||||
("Warsaw","PL"), ("Prague","CZ"), ("Budapest","HU"),
|
||||
("Athens","GR"), ("Istanbul","TR"), ("New York","US"),
|
||||
("Los Angeles","US"), ("Miami","US"), ("Chicago","US"),
|
||||
("Toronto","CA"), ("Vancouver","CA"), ("Sydney","AU"),
|
||||
("Melbourne","AU"), ("Tokyo","JP"), ("Osaka","JP"),
|
||||
("Singapore","SG"), ("Bangkok","TH"), ("Dubai","AE"),
|
||||
("Mumbai","IN"), ("Cape Town","ZA"), ("Marrakech","MA"),
|
||||
("Cairo","EG"), ("Cancun","MX"), ("Rio de Janeiro","BR"),
|
||||
("Seoul","KR"), ("Kuala Lumpur","MY"),
|
||||
};
|
||||
|
||||
// Star rating distribution: 3★ most common, 1★ rarest
|
||||
int[] starWeights = [0, 5, 10, 40, 30, 15]; // index = star, value = weight
|
||||
// Draws a star rating with probability proportional to starWeights[star].
// Index 0 of starWeights is an unused placeholder so that index == star.
int PickStar()
{
    // Roll against the real weight total instead of a hard-coded 100, so the
    // distribution stays correct if the weight table is ever retuned.
    int roll = rng.Next(starWeights.Skip(1).Sum());
    int cum = 0;
    for (int s = 1; s < starWeights.Length; s++)
    {
        cum += starWeights[s];
        if (roll < cum) return s;
    }
    // Only reachable when every weight is zero; keep the historical fallback.
    return 3;
}
|
||||
|
||||
var hotelRows = new List<string>();
var roomRows = new List<string>();
var charCodes = characteristics.Select(c => c.Code).ToArray();

// hotel_id is assigned by the database, so rooms and characteristic links are staged
// against the hotel's generation index h and resolved to real IDs after the hotel insert.
// The floor is carried explicitly instead of being re-derived from the room number's
// first character, which would be wrong as soon as a hotel reaches floor 10.
var hotelRoomTypes = new List<(int hotelIndex, string roomNumber, string roomTypeCode, int floor)>();
// charIdSql is the characteristic id already rendered as SQL text.
var hotelCharLinks = new List<(int hotelIndex, string charIdSql)>();

string[] streetNames = ["Main St","Park Ave","King Rd","Grand Blvd","Lake Dr",
                        "Ocean Blvd","Hill Rd","Market St","Central Ave","Palace Rd"];

for (int h = 0; h < HOTEL_COUNT; h++)
{
    var (city, ctryCode) = hotelCities[h % hotelCities.Length];
    int chainIndex = rng.Next(chains.Length);
    // 20% of hotels are independent (no chain)
    int? chainId = rng.Next(100) < 20 ? null : chainIds[chains[chainIndex].Code];
    int star = PickStar();
    int starId = starIds[star];
    int ctryId = countryIds[ctryCode];

    string code = $"HTL{h+1:D4}";
    string name = chainId == null
        ? $"The {city} Hotel"
        : $"{chains[chainIndex].Name.Split(' ')[0]} {city}";
    string addr = $"{rng.Next(1, 200)} {streetNames[rng.Next(streetNames.Length)]}";
    // Invariant casing keeps the generated URL independent of the machine's culture.
    string url = $"https://www.{code.ToLowerInvariant()}.example.com";

    hotelRows.Add($"({N(chainId)},{ctryId},{starId},{S(code)},{S(name)},{S(addr)},{S("00000")},{S(city)},{S(url)})");

    // Characteristics: higher star ratings get more (5★ takes up to 11)
    int charCount = star switch { 5 => 11, 4 => 8, 3 => 6, 2 => 4, _ => 3 };
    var shuffled = charCodes.OrderBy(_ => rng.Next()).Take(charCount).ToArray();
    foreach (var cc in shuffled)
        hotelCharLinks.Add((h, $"{charIds[cc]}"));

    // Rooms: more rooms for higher star hotels
    int roomCount = star switch { 5 => rng.Next(40, 60), 4 => rng.Next(25, 40),
                                  3 => rng.Next(15, 25), 2 => rng.Next(8, 15), _ => rng.Next(5, 10) };

    // Room type distribution per star rating (repeated entries weight the draw)
    string[] typePool = star switch
    {
        5 => ["DOUBLE","DOUBLE","DELUXE","DELUXE","SUITE","SUITE","EXEC","FAMILY"],
        4 => ["SINGLE","DOUBLE","DOUBLE","DELUXE","SUITE","FAMILY"],
        3 => ["SINGLE","SINGLE","DOUBLE","DOUBLE","TWIN","FAMILY"],
        2 => ["SINGLE","SINGLE","DOUBLE","TWIN"],
        _ => ["SINGLE","SINGLE","DOUBLE"],
    };

    for (int r = 0; r < roomCount; r++)
    {
        // Ten rooms per floor; room number = floor followed by a two-digit position
        int floor = r / 10 + 1;
        string rnum = $"{floor}{(r % 10 + 1):D2}";
        string rtype = typePool[rng.Next(typePool.Length)];
        hotelRoomTypes.Add((h, rnum, rtype, floor));
    }
}

await BulkInsert("hotel",
    "hotel_chain_id, country_id, star_rating_id, code, name, address, postcode, city, url",
    hotelRows);

// Load hotel IDs in ascending order; position i is taken to be the hotel generated at index i
var hotelIds = new List<int>();
{
    await using var cmd = new MySqlCommand("SELECT hotel_id FROM hotel ORDER BY hotel_id", conn);
    await using var r = await cmd.ExecuteReaderAsync();
    while (await r.ReadAsync()) hotelIds.Add(r.GetInt32(0));
}

// Now build hotel_room rows with real hotel IDs
foreach (var (hIdx, rnum, rtype, floor) in hotelRoomTypes)
    roomRows.Add($"({hotelIds[hIdx]},{roomTypeIds[rtype]},{S(rnum)},{floor})");

await BulkInsert("hotel_room", "hotel_id, room_type_id, room_number, floor", roomRows);

// hotel_hotel_characteristic — resolve the staged hotel index to the real hotel_id
var hhcRows = hotelCharLinks
    .Select(row => $"({hotelIds[row.hotelIndex]},{row.charIdSql})")
    .Distinct()
    .ToList();
await BulkInsert("hotel_hotel_characteristic", "hotel_id, characteristic_id", hhcRows);
|
||||
|
||||
// Index every stored room by its hotel: hotel_id → list of (room_id, room_type_id)
var hotelRooms = new Dictionary<int, List<(int RoomId, int RoomTypeId)>>();
{
    await using var cmd = new MySqlCommand("SELECT room_id, hotel_id, room_type_id FROM hotel_room", conn);
    await using var r = await cmd.ExecuteReaderAsync();
    while (await r.ReadAsync())
    {
        int hid = r.GetInt32(1);

        // First room seen for this hotel: create its list before appending
        if (!hotelRooms.TryGetValue(hid, out var roomsOfHotel))
        {
            roomsOfHotel = [];
            hotelRooms[hid] = roomsOfHotel;
        }

        roomsOfHotel.Add((r.GetInt32(0), r.GetInt32(2)));
    }
}
|
||||
|
||||
// ── 7. guest ──────────────────────────────────────────────────────────────────
|
||||
Console.WriteLine("[7/8] guest");
|
||||
|
||||
string[] firstNames =
|
||||
[
|
||||
"James","Mary","John","Patricia","Robert","Jennifer","Michael","Linda","William","Barbara",
|
||||
"David","Elizabeth","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Charles","Karen",
|
||||
"Luca","Sofia","Marco","Giulia","Hans","Anna","Klaus","Maria","Pierre","Marie","Jean","Claire",
|
||||
"Miguel","Ana","Carlos","Carmen","Andrei","Ioana","Mihai","Elena","Tomasz","Agnieszka",
|
||||
"Dimitri","Eleni","Mehmet","Fatima","Yuki","Kenji","Haruto","Yuna","Wei","Fang","Li","Mei",
|
||||
"Ahmed","Layla","Omar","Nour","Raj","Priya","Arjun","Ananya","Lucas","Emma","Noah","Olivia",
|
||||
"Ethan","Ava","Mason","Isabella","Liam","Sophia","Oliver","Charlotte","Elijah","Amelia",
|
||||
];
|
||||
|
||||
string[] lastNames =
|
||||
[
|
||||
"Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Wilson","Moore",
|
||||
"Taylor","Anderson","Thomas","Jackson","White","Harris","Martin","Thompson","Young","Lee",
|
||||
"Rossi","Ferrari","Esposito","Romano","Müller","Schmidt","Fischer","Weber","Meyer","Wagner",
|
||||
"Dupont","Martin","Bernard","Petit","Dubois","Moreau","Laurent","Simon","Michel","Garcia",
|
||||
"Kowalski","Nowak","Wiśniewski","Wójcik","Kowalczyk","Kamiński","Lewandowski","Zieliński",
|
||||
"Papadopoulos","Georgiou","Yilmaz","Kaya","Tanaka","Sato","Suzuki","Watanabe","Ito","Yamamoto",
|
||||
"Wang","Li","Zhang","Liu","Chen","Yang","Huang","Zhao","Kim","Park","Lee","Choi","Patel",
|
||||
"Singh","Kumar","Sharma","Gupta","Ali","Hassan","Ahmed","Mohamed","Silva","Santos","Oliveira",
|
||||
];
|
||||
|
||||
string[] guestCities =
|
||||
[
|
||||
"London","Paris","Berlin","Madrid","Rome","Amsterdam","Vienna","Zurich","Brussels","Stockholm",
|
||||
"New York","Los Angeles","Chicago","Houston","Phoenix","Toronto","Vancouver","Sydney","Melbourne",
|
||||
"Tokyo","Seoul","Beijing","Shanghai","Singapore","Bangkok","Dubai","Mumbai","Cape Town",
|
||||
"Warsaw","Prague","Budapest","Athens","Istanbul","Lisbon","Oslo","Copenhagen","Helsinki",
|
||||
];
|
||||
|
||||
var countryList = countries.Select(c => c.Code).ToArray();

// Generate GUEST_COUNT guests with a random name, e-mail, city and country.
var guestRows = new List<string>();
for (int g = 0; g < GUEST_COUNT; g++)
{
    string fn = firstNames[rng.Next(firstNames.Length)];
    string ln = lastNames[rng.Next(lastNames.Length)];
    string name = $"{fn} {ln}";
    // Lower-case with the invariant culture: culture-sensitive ToLower() maps 'I' to
    // dotless 'ı' under e.g. tr-TR, which would change addresses for "Ito", "Isabella", ...
    string email = $"{fn.ToLowerInvariant()}.{ln.ToLowerInvariant()}{rng.Next(100, 999)}@example.com";
    string city = guestCities[rng.Next(guestCities.Length)];
    int ctryId = countryIds[countryList[rng.Next(countryList.Length)]];
    guestRows.Add($"({ctryId},{S(name)},{S(email)},{S(city)})");
}
await BulkInsert("guest", "country_id, name, email, city", guestRows);

// Bookings draw guest_id uniformly from [min, max]; this assumes the ids form a gap-free range
var guestIdMin = (int)await ExecScalar("SELECT MIN(guest_id) FROM guest");
var guestIdMax = (int)await ExecScalar("SELECT MAX(guest_id) FROM guest");
|
||||
|
||||
// ── 8. booking + room_booking ─────────────────────────────────────────────────
|
||||
Console.WriteLine("[8/8] booking + room_booking");
|
||||
|
||||
var dateStart = new DateTime(2022, 1, 1);
|
||||
var dateEnd = new DateTime(2025, 12, 31);
|
||||
int dateRange = (dateEnd - dateStart).Days;
|
||||
|
||||
// Seasonal weight: month → weight (higher = more bookings)
|
||||
int[] monthWeight = [0, 6, 5, 7, 9, 10, 14, 16, 15, 11, 9, 7, 11]; // Jan-Dec
|
||||
|
||||
// Returns a random check-in date in [dateStart, dateEnd], distributed by month
// according to monthWeight (rejection sampling: propose a uniform day, accept it
// with probability monthWeight[month] / max weight).
DateTime RandomCheckin()
{
    // Derive the acceptance ceiling from the table so the busiest month is always
    // accepted and retuning the weights cannot silently distort the distribution.
    int maxWeight = monthWeight.Max();
    while (true)
    {
        // +1 because Random.Next's upper bound is exclusive; without it dateEnd is never drawn
        var d = dateStart.AddDays(rng.Next(dateRange + 1));
        if (rng.Next(maxWeight) < monthWeight[d.Month]) return d;
    }
}
|
||||
|
||||
// Length of stay in nights: 30% one, 25% two, 20% three, 10% four, 7% five,
// 4% six or seven, 4% eight to fourteen.
int RandomNights()
{
    int roll = rng.Next(100);
    if (roll < 30) return 1;
    if (roll < 55) return 2;
    if (roll < 75) return 3;
    if (roll < 85) return 4;
    if (roll < 92) return 5;
    // Long stays need a second draw; the upper bounds are exclusive
    return roll < 96 ? rng.Next(6, 8) : rng.Next(8, 15);
}
|
||||
|
||||
// Booking status: 80% completed, 10% confirmed, 7% cancelled, 3% no_show.
string RandomStatus()
{
    int roll = rng.Next(100);
    if (roll < 80) return "completed";
    if (roll < 90) return "confirmed";
    if (roll < 97) return "cancelled";
    return "no_show";
}
|
||||
|
||||
// Rooms per booking: 90% one room, 8% two rooms, 2% three rooms
int RandomRoomCount()
{
    int roll = rng.Next(100);
    if (roll < 90) return 1;
    return roll < 98 ? 2 : 3;
}
|
||||
|
||||
var bookingRows = new List<string>();
|
||||
var roomBookingRows = new List<string>();
|
||||
|
||||
// We need booking_id for room_booking FK.
|
||||
// Strategy: flush bookings in batches, then read back the auto-increment IDs,
|
||||
// then insert room_bookings for that batch.
|
||||
|
||||
// Generate bookings batch by batch. Each batch is inserted with one multi-row INSERT,
// read back to obtain the generated booking_ids, and then its room_booking rows are
// inserted with a second multi-row INSERT.
int bookingsDone = 0;
// Next running total at which progress is reported. Tracking a threshold rather than
// testing "% 10_000 == 0" keeps the output working when BATCH does not divide 10,000.
int nextProgressMark = 10_000;
while (bookingsDone < BOOKING_COUNT)
{
    int batchSize = Math.Min(BATCH, BOOKING_COUNT - bookingsDone);
    bookingRows.Clear();
    roomBookingRows.Clear();

    for (int b = 0; b < batchSize; b++)
    {
        int guestId = guestIdMin + rng.Next(guestIdMax - guestIdMin + 1);
        int hotelId = hotelIds[rng.Next(hotelIds.Count)];
        DateTime checkin = RandomCheckin();
        int nights = RandomNights();
        DateTime checkout = checkin.AddDays(nights);
        string status = RandomStatus();
        // Booked between 1 and 179 days ahead of arrival
        DateTime created = checkin.AddDays(-rng.Next(1, 180));

        bookingRows.Add($"({guestId},{hotelId},{D(checkin)},{D(checkout)},{S(status)},{DT(created)})");
    }

    await Exec($"INSERT INTO booking (guest_id, hotel_id, date_from, date_to, status, created_at) VALUES {string.Join(',', bookingRows)}");
    // For a multi-row INSERT, LAST_INSERT_ID() is the id generated for the first row
    long firstId = await ExecScalar("SELECT LAST_INSERT_ID()");

    // Re-read the batch just written (everything from firstId upwards) to pair each
    // booking_id with its hotel, dates and status.
    await using (var cmd = new MySqlCommand(
        $"SELECT booking_id, hotel_id, date_from, date_to, status FROM booking WHERE booking_id >= {firstId} ORDER BY booking_id", conn))
    await using (var reader = await cmd.ExecuteReaderAsync())
    {
        while (await reader.ReadAsync())
        {
            long bookingId = reader.GetInt64(0);
            int hid = reader.GetInt32(1);
            DateTime dfrom = reader.GetDateTime(2);
            DateTime dto = reader.GetDateTime(3);
            string status = reader.GetString(4);
            int nights = (dto - dfrom).Days;

            // Hotels without rooms cannot receive room bookings
            if (!hotelRooms.TryGetValue(hid, out var roomsOfHotel) || roomsOfHotel.Count == 0) continue;

            // Skip room_bookings for cancelled/no_show sometimes
            if (status == "cancelled" && rng.Next(100) < 60) continue;
            if (status == "no_show" && rng.Next(100) < 30) continue;

            int roomCount = RandomRoomCount();
            var available = roomsOfHotel.OrderBy(_ => rng.Next()).Take(roomCount).ToList();

            // The season is determined by the check-in month and is the same for every room
            int ratePeriodId = monthToRatePeriodId[dfrom.Month];

            foreach (var (roomId, roomTypeId) in available)
            {
                decimal nightly = rateMap[(roomTypeId, ratePeriodId)];
                decimal total = Math.Round(nightly * nights, 2);
                roomBookingRows.Add($"({bookingId},{roomId},{D(dfrom)},{D(dto)},{N(nightly)},{N(total)})");
            }
        }
    }

    if (roomBookingRows.Count > 0)
        await Exec($"INSERT INTO room_booking (booking_id, room_id, date_from, date_to, nightly_rate, total_amount) VALUES {string.Join(',', roomBookingRows)}");

    bookingsDone += batchSize;
    if (bookingsDone >= nextProgressMark || bookingsDone == BOOKING_COUNT)
    {
        Console.WriteLine($" bookings: {bookingsDone:N0} / {BOOKING_COUNT:N0}");
        while (nextProgressMark <= bookingsDone) nextProgressMark += 10_000;
    }
}
|
||||
|
||||
// Final report: print the row count of every seeded table.
// Table names come from this fixed list only, so interpolating them into SQL is safe.
string[] countedTables =
[
    "hotel_chain", "country", "star_rating", "hotel_characteristic",
    "room_type", "rate_period", "period_room_rate", "hotel",
    "hotel_room", "hotel_hotel_characteristic", "guest", "booking", "room_booking",
];

Console.WriteLine();
Console.WriteLine("── Row counts ───────────────────────────────");
foreach (var t in countedTables)
{
    long cnt = await ExecScalar($"SELECT COUNT(*) FROM {t}");
    Console.WriteLine($" {t,-35} {cnt,10:N0}");
}
Console.WriteLine("Done.");
|
||||
187
sql/datamart_schema.sql
Normal file
187
sql/datamart_schema.sql
Normal file
@@ -0,0 +1,187 @@
|
||||
-- ETL_WATERMARK: one row per entity name, holding a numeric LAST_KEY and the
-- timestamp of the last run (presumably the highest source key loaded so far;
-- confirm against the ETL code).
CREATE TABLE ETL_WATERMARK
(
    ENTITY_NAME VARCHAR2(50)                       NOT NULL,
    LAST_KEY    NUMBER(20)   DEFAULT 0             NOT NULL,
    LAST_RUN_TS TIMESTAMP(6) DEFAULT SYSTIMESTAMP,
    CONSTRAINT PK_ETL_WATERMARK PRIMARY KEY (ENTITY_NAME)
)
/
|
||||
|
||||
-- STG_HOTEL: staging copy of a hotel, denormalised with its country, chain and
-- star-rating attributes. No keys or constraints beyond NOT NULL.
CREATE TABLE STG_HOTEL
(
    HOTEL_ID         NUMBER(10)    NOT NULL,
    HOTEL_CODE       VARCHAR2(20)  NOT NULL,
    HOTEL_NAME       VARCHAR2(150) NOT NULL,
    CITY             VARCHAR2(100) NOT NULL,
    COUNTRY_CODE     CHAR(2)       NOT NULL,
    COUNTRY_NAME     VARCHAR2(100) NOT NULL,
    CURRENCY         VARCHAR2(10)  NOT NULL,
    CHAIN_CODE       VARCHAR2(10),            -- nullable: chain is optional
    CHAIN_NAME       VARCHAR2(100),
    STAR_RATING      NUMBER(1)     NOT NULL,
    STAR_DESCRIPTION VARCHAR2(20)
)
/
|
||||
|
||||
-- DIM_DATE: calendar dimension keyed by an 8-digit DATE_KEY (presumably YYYYMMDD;
-- confirm against the loader). The two flag columns are restricted to 0/1.
CREATE TABLE DIM_DATE
(
    DATE_KEY        NUMBER(8)    NOT NULL,
    FULL_DATE       DATE         NOT NULL,
    YEAR            NUMBER(4)    NOT NULL,
    QUARTER         NUMBER(1)    NOT NULL,
    MONTH           NUMBER(2)    NOT NULL,
    MONTH_NAME      VARCHAR2(10) NOT NULL,
    WEEK_NUMBER     NUMBER(2)    NOT NULL,
    DAY_OF_MONTH    NUMBER(2)    NOT NULL,
    DAY_NAME        VARCHAR2(10) NOT NULL,
    IS_WEEKEND      NUMBER(1)    NOT NULL,
    IS_BUSINESS_DAY NUMBER(1)    NOT NULL,
    SEASON          VARCHAR2(10) NOT NULL,
    CONSTRAINT PK_DIM_DATE          PRIMARY KEY (DATE_KEY),
    CONSTRAINT CK_DIM_DATE_WEEKEND  CHECK (IS_WEEKEND IN (0, 1)),
    CONSTRAINT CK_DIM_DATE_BUSINESS CHECK (IS_BUSINESS_DAY IN (0, 1))
)
/
|
||||
|
||||
-- DIM_HOTEL: hotel dimension with a surrogate HOTEL_KEY. EFFECTIVE_DATE, EXPIRY_DATE
-- and IS_CURRENT version the rows of one SOURCE_HOTEL_ID (presumably a type-2 slowly
-- changing dimension; confirm against the ETL).
-- HOTEL_KEY is a plain identity column. The exported DDL also carried
-- DEFAULT "IPZ19438"."ISEQ$$_303891".nextval, a reference to the system-generated
-- sequence of one particular schema. Oracle rejects a DEFAULT on an identity column,
-- and that sequence does not exist anywhere else.
create table DIM_HOTEL
(
    HOTEL_KEY        NUMBER(10) generated as identity
        constraint PK_DIM_HOTEL
            primary key,
    SOURCE_HOTEL_ID  NUMBER(10)        not null,
    HOTEL_CODE       VARCHAR2(20)      not null,
    HOTEL_NAME       VARCHAR2(150)     not null,
    CITY             VARCHAR2(100)     not null,
    COUNTRY_CODE     CHAR(2)           not null,
    COUNTRY_NAME     VARCHAR2(100)     not null,
    CURRENCY         VARCHAR2(10)      not null,
    CHAIN_CODE       VARCHAR2(10),
    CHAIN_NAME       VARCHAR2(100),
    STAR_RATING      NUMBER(1)         not null,
    STAR_DESCRIPTION VARCHAR2(20),
    EFFECTIVE_DATE   DATE              not null,
    EXPIRY_DATE      DATE,
    IS_CURRENT       NUMBER(1) default 1 not null
        constraint CK_DIM_HOTEL_CURRENT
            check (is_current IN (0, 1))
)
/
|
||||
|
||||
-- DIM_ROOM: room dimension, one row per source room (SOURCE_ROOM_ID is unique),
-- linked to its hotel in DIM_HOTEL and carrying the room-type attributes inline.
CREATE TABLE DIM_ROOM
(
    ROOM_KEY              NUMBER(10)    GENERATED ALWAYS AS IDENTITY,
    SOURCE_ROOM_ID        NUMBER(10)    NOT NULL,
    HOTEL_KEY             NUMBER(10)    NOT NULL,
    ROOM_NUMBER           VARCHAR2(10)  NOT NULL,
    FLOOR                 NUMBER(3)     NOT NULL,
    ROOM_TYPE_CODE        VARCHAR2(20)  NOT NULL,
    ROOM_TYPE_DESCRIPTION VARCHAR2(100) NOT NULL,
    SMOKING_YN            NUMBER(1)     NOT NULL,   -- 0/1 flag
    STANDARD_RATE         NUMBER(10, 2) NOT NULL,
    CONSTRAINT PK_DIM_ROOM         PRIMARY KEY (ROOM_KEY),
    CONSTRAINT UQ_DIM_ROOM         UNIQUE (SOURCE_ROOM_ID),
    CONSTRAINT FK_DIM_ROOM_HOTEL   FOREIGN KEY (HOTEL_KEY) REFERENCES DIM_HOTEL (HOTEL_KEY),
    CONSTRAINT CK_DIM_ROOM_SMOKING CHECK (SMOKING_YN IN (0, 1))
)
/
|
||||
|
||||
-- DIM_GUEST: guest dimension, one row per source guest (SOURCE_GUEST_ID is unique).
-- City and country attributes are optional.
CREATE TABLE DIM_GUEST
(
    GUEST_KEY       NUMBER(10)    GENERATED ALWAYS AS IDENTITY,
    SOURCE_GUEST_ID NUMBER(10)    NOT NULL,
    GUEST_NAME      VARCHAR2(150) NOT NULL,
    CITY            VARCHAR2(100),
    COUNTRY_CODE    CHAR(2),
    COUNTRY_NAME    VARCHAR2(100),
    CONSTRAINT PK_DIM_GUEST PRIMARY KEY (GUEST_KEY),
    CONSTRAINT UQ_DIM_GUEST UNIQUE (SOURCE_GUEST_ID)
)
/
|
||||
|
||||
-- FACT_ROOM_BOOKING: one fact row per source room booking (SOURCE_RB_ID is unique),
-- referencing the hotel, room and guest dimensions and three roles of DIM_DATE
-- (booking created, check-in, check-out). BOOKING_COUNT is constrained to 1.
-- FACT_ID is a plain identity column. The exported DDL also carried
-- DEFAULT "IPZ19438"."ISEQ$$_303902".nextval, a reference to the system-generated
-- sequence of one particular schema. Oracle rejects a DEFAULT on an identity column,
-- and that sequence does not exist anywhere else.
create table FACT_ROOM_BOOKING
(
    FACT_ID                  NUMBER(10) generated as identity
        constraint PK_FACT_ROOM_BOOKING
            primary key,
    SOURCE_RB_ID             NUMBER(10)    not null
        constraint UQ_FACT_ROOM_BOOKING_SRC
            unique,
    HOTEL_KEY                NUMBER(10)    not null
        constraint FK_FACT_HOTEL
            references DIM_HOTEL,
    ROOM_KEY                 NUMBER(10)    not null
        constraint FK_FACT_ROOM
            references DIM_ROOM,
    GUEST_KEY                NUMBER(10)    not null
        constraint FK_FACT_GUEST
            references DIM_GUEST,
    BOOKING_CREATED_DATE_KEY NUMBER(8)     not null
        constraint FK_FACT_BOOKING_DATE
            references DIM_DATE,
    CHECKIN_DATE_KEY         NUMBER(8)     not null
        constraint FK_FACT_CHECKIN_DATE
            references DIM_DATE,
    CHECKOUT_DATE_KEY        NUMBER(8)     not null
        constraint FK_FACT_CHECKOUT_DATE
            references DIM_DATE,
    BOOKING_STATUS           VARCHAR2(20)  not null,
    BOOKING_COUNT            NUMBER(1) default 1 not null
        constraint CK_FACT_BOOKING_COUNT
            check (booking_count = 1),
    NIGHTS_STAYED            NUMBER(4)     not null,
    NIGHTLY_RATE             NUMBER(10, 2) not null,
    TOTAL_AMOUNT             NUMBER(12, 2) not null
)
/
|
||||
|
||||
-- STG_GUEST: staging copy of a guest with the same attribute columns as DIM_GUEST,
-- without surrogate key or constraints.
CREATE TABLE STG_GUEST
(
    SOURCE_GUEST_ID NUMBER(10)    NOT NULL,
    GUEST_NAME      VARCHAR2(150) NOT NULL,
    CITY            VARCHAR2(100),
    COUNTRY_CODE    CHAR(2),
    COUNTRY_NAME    VARCHAR2(100)
)
/
|
||||
|
||||
-- STG_ROOM: staging copy of a room with its room-type attributes. The hotel is
-- identified by HOTEL_CODE and, optionally, by the source HOTEL_ID.
CREATE TABLE STG_ROOM
(
    SOURCE_ROOM_ID        NUMBER(10)    NOT NULL,
    HOTEL_CODE            VARCHAR2(20)  NOT NULL,
    ROOM_NUMBER           VARCHAR2(10)  NOT NULL,
    FLOOR                 NUMBER(3)     NOT NULL,
    ROOM_TYPE_CODE        VARCHAR2(20)  NOT NULL,
    ROOM_TYPE_DESCRIPTION VARCHAR2(100) NOT NULL,
    SMOKING_YN            NUMBER(1)     NOT NULL,
    STANDARD_RATE         NUMBER(10, 2) NOT NULL,
    HOTEL_ID              NUMBER(10)
)
/
|
||||
|
||||
-- STG_ROOM_BOOKING: staging copy of a room booking with source ids (guest, hotel,
-- room), the three booking dates as DATE values, and the measures later stored in
-- FACT_ROOM_BOOKING.
CREATE TABLE STG_ROOM_BOOKING
(
    SOURCE_RB_ID         NUMBER(10)          NOT NULL,
    GUEST_ID             NUMBER(10)          NOT NULL,
    BOOKING_CREATED_DATE DATE                NOT NULL,
    CHECKIN_DATE         DATE                NOT NULL,
    CHECKOUT_DATE        DATE                NOT NULL,
    BOOKING_STATUS       VARCHAR2(20)        NOT NULL,
    BOOKING_COUNT        NUMBER(1) DEFAULT 1 NOT NULL,
    NIGHTS_STAYED        NUMBER(4)           NOT NULL,
    NIGHTLY_RATE         NUMBER(10, 2)       NOT NULL,
    TOTAL_AMOUNT         NUMBER(12, 2)       NOT NULL,
    HOTEL_ID             NUMBER(10)          NOT NULL,
    ROOM_ID              NUMBER(10)          NOT NULL
)
/
|
||||
|
||||
154
sql/schema.sql
Normal file
154
sql/schema.sql
Normal file
@@ -0,0 +1,154 @@
|
||||
CREATE DATABASE IF NOT EXISTS hotel_reservations
    CHARACTER SET utf8mb4
    COLLATE utf8mb4_unicode_ci;

-- If the database already exists (for example, pre-created by the container's
-- MYSQL_DATABASE setting before init scripts run), the CREATE above is a no-op and
-- its character set / collation are ignored. Re-assert them so that the tables
-- created below inherit the intended defaults.
ALTER DATABASE hotel_reservations
    CHARACTER SET utf8mb4
    COLLATE utf8mb4_unicode_ci;

USE hotel_reservations;
|
||||
|
||||
-- ─────────────────────────────────────────────────────────────────────────────
|
||||
-- LOOKUP / REFERENCE TABLES
|
||||
-- ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
-- Hotel chains, each identified by a unique short code.
CREATE TABLE hotel_chain (
    hotel_chain_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    code           VARCHAR(10)  NOT NULL,
    name           VARCHAR(100) NOT NULL,
    UNIQUE KEY uq_chain_code (code)
);
|
||||
|
||||
-- Countries, each identified by a unique two-character code, with name and currency.
CREATE TABLE country (
    country_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    code       CHAR(2)      NOT NULL,
    name       VARCHAR(100) NOT NULL,
    currency   VARCHAR(10)  NOT NULL,
    UNIQUE KEY uq_country_code (code)
);
|
||||
|
||||
-- Star ratings; code is the unique numeric rating, description its label.
CREATE TABLE star_rating (
    star_rating_id INT UNSIGNED     NOT NULL AUTO_INCREMENT PRIMARY KEY,
    code           TINYINT UNSIGNED NOT NULL,
    description    VARCHAR(20)      NOT NULL,
    UNIQUE KEY uq_star_code (code)
);
|
||||
|
||||
-- Characteristics a hotel can have, each identified by a unique code.
CREATE TABLE hotel_characteristic (
    characteristic_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    code              VARCHAR(20)  NOT NULL,
    description       VARCHAR(100) NOT NULL,
    UNIQUE KEY uq_char_code (code)
);
|
||||
|
||||
-- Room types with their standard rate; smoking_yn defaults to FALSE.
CREATE TABLE room_type (
    room_type_id  INT UNSIGNED  NOT NULL AUTO_INCREMENT PRIMARY KEY,
    code          VARCHAR(20)   NOT NULL,
    description   VARCHAR(100)  NOT NULL,
    standard_rate DECIMAL(10,2) NOT NULL,
    smoking_yn    BOOLEAN       NOT NULL DEFAULT FALSE,
    UNIQUE KEY uq_room_type_code (code)
);
|
||||
|
||||
-- Rate periods (seasons), defined by an inclusive month range. month_from may be
-- greater than month_to for a period that wraps over the year end (e.g. 12 → 2).
-- Both columns are calendar months, so values outside 1..12 are rejected.
CREATE TABLE rate_period (
    rate_period_id  INT UNSIGNED    NOT NULL AUTO_INCREMENT,
    code            VARCHAR(20)     NOT NULL,
    description     VARCHAR(50)     NOT NULL,
    month_from      TINYINT UNSIGNED NOT NULL,
    month_to        TINYINT UNSIGNED NOT NULL,
    PRIMARY KEY (rate_period_id),
    UNIQUE KEY uq_rate_period_code (code),
    CONSTRAINT ck_rate_period_month_from CHECK (month_from BETWEEN 1 AND 12),
    CONSTRAINT ck_rate_period_month_to   CHECK (month_to BETWEEN 1 AND 12)
);
|
||||
|
||||
-- ─────────────────────────────────────────────────────────────────────────────
|
||||
-- CORE ENTITIES
|
||||
-- ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
-- Hotels. Country and star rating are mandatory; hotel_chain_id is nullable, so a
-- hotel may belong to no chain. code is unique across hotels.
CREATE TABLE hotel (
    hotel_id       INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    hotel_chain_id INT UNSIGNED,
    country_id     INT UNSIGNED NOT NULL,
    star_rating_id INT UNSIGNED NOT NULL,
    code           VARCHAR(20)  NOT NULL,
    name           VARCHAR(150) NOT NULL,
    address        VARCHAR(200),
    postcode       VARCHAR(20),
    city           VARCHAR(100) NOT NULL,
    url            VARCHAR(200),
    UNIQUE KEY uq_hotel_code (code),
    CONSTRAINT fk_hotel_chain
        FOREIGN KEY (hotel_chain_id) REFERENCES hotel_chain (hotel_chain_id),
    CONSTRAINT fk_hotel_country
        FOREIGN KEY (country_id) REFERENCES country (country_id),
    CONSTRAINT fk_hotel_star
        FOREIGN KEY (star_rating_id) REFERENCES star_rating (star_rating_id)
);
|
||||
|
||||
-- Rooms of a hotel. A room number is unique within its hotel; floor defaults to 0.
CREATE TABLE hotel_room (
    room_id      INT UNSIGNED     NOT NULL AUTO_INCREMENT PRIMARY KEY,
    hotel_id     INT UNSIGNED     NOT NULL,
    room_type_id INT UNSIGNED     NOT NULL,
    room_number  VARCHAR(10)      NOT NULL,
    floor        TINYINT UNSIGNED NOT NULL DEFAULT 0,
    UNIQUE KEY uq_hotel_room (hotel_id, room_number),
    CONSTRAINT fk_room_hotel
        FOREIGN KEY (hotel_id) REFERENCES hotel (hotel_id),
    CONSTRAINT fk_room_type
        FOREIGN KEY (room_type_id) REFERENCES room_type (room_type_id)
);
|
||||
|
||||
-- Guests. Only the name is mandatory; country, e-mail, address and city are optional.
CREATE TABLE guest (
    guest_id   INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    country_id INT UNSIGNED,
    name       VARCHAR(150) NOT NULL,
    email      VARCHAR(150),
    address    VARCHAR(200),
    city       VARCHAR(100),
    CONSTRAINT fk_guest_country
        FOREIGN KEY (country_id) REFERENCES country (country_id)
);
|
||||
|
||||
-- Bookings made by a guest at a hotel for a date range. status defaults to
-- 'confirmed' and created_at to the insertion time.
CREATE TABLE booking (
    booking_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    guest_id   INT UNSIGNED NOT NULL,
    hotel_id   INT UNSIGNED NOT NULL,
    date_from  DATE         NOT NULL,
    date_to    DATE         NOT NULL,
    status     ENUM('confirmed', 'cancelled', 'completed', 'no_show') NOT NULL DEFAULT 'confirmed',
    created_at DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT fk_booking_guest
        FOREIGN KEY (guest_id) REFERENCES guest (guest_id),
    CONSTRAINT fk_booking_hotel
        FOREIGN KEY (hotel_id) REFERENCES hotel (hotel_id)
);
|
||||
|
||||
-- Rooms reserved under a booking, each with its own date range, nightly rate and
-- total amount.
CREATE TABLE room_booking (
    room_booking_id INT UNSIGNED  NOT NULL AUTO_INCREMENT PRIMARY KEY,
    booking_id      INT UNSIGNED  NOT NULL,
    room_id         INT UNSIGNED  NOT NULL,
    date_from       DATE          NOT NULL,
    date_to         DATE          NOT NULL,
    nightly_rate    DECIMAL(10,2) NOT NULL,
    total_amount    DECIMAL(10,2) NOT NULL,
    CONSTRAINT fk_rb_booking
        FOREIGN KEY (booking_id) REFERENCES booking (booking_id),
    CONSTRAINT fk_rb_room
        FOREIGN KEY (room_id) REFERENCES hotel_room (room_id)
);
|
||||
|
||||
-- ─────────────────────────────────────────────────────────────────────────────
|
||||
-- JUNCTION TABLES
|
||||
-- ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
-- Junction table: which characteristics each hotel has (at most one row per pair).
CREATE TABLE hotel_hotel_characteristic (
    hotel_id          INT UNSIGNED NOT NULL,
    characteristic_id INT UNSIGNED NOT NULL,
    PRIMARY KEY (hotel_id, characteristic_id),
    CONSTRAINT fk_hhc_hotel
        FOREIGN KEY (hotel_id) REFERENCES hotel (hotel_id),
    CONSTRAINT fk_hhc_char
        FOREIGN KEY (characteristic_id) REFERENCES hotel_characteristic (characteristic_id)
);
|
||||
|
||||
-- Junction table: the rate of a room type during a rate period (one rate per pair).
CREATE TABLE period_room_rate (
    room_type_id   INT UNSIGNED  NOT NULL,
    rate_period_id INT UNSIGNED  NOT NULL,
    rate           DECIMAL(10,2) NOT NULL,
    PRIMARY KEY (room_type_id, rate_period_id),
    CONSTRAINT fk_prr_type
        FOREIGN KEY (room_type_id) REFERENCES room_type (room_type_id),
    CONSTRAINT fk_prr_period
        FOREIGN KEY (rate_period_id) REFERENCES rate_period (rate_period_id)
);
|
||||
Reference in New Issue
Block a user