Add initial work from Codex

commit adb5c1a439 (parent 19771ddd37), 2026-03-20 15:13:33 +01:00
48 changed files with 7054 additions and 16 deletions

backend/app/__init__.py Normal file

@@ -0,0 +1 @@
"""Backend application package."""

backend/app/core/config.py Normal file

@@ -0,0 +1,135 @@
from __future__ import annotations
from functools import lru_cache
from urllib.parse import quote_plus
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
extra="ignore",
)
app_name: str = "otel-bi-backend"
app_env: str = "dev"
log_level: str = "INFO"
api_host: str = "0.0.0.0"
api_port: int = 8000
cors_origins: str = "http://localhost:5173"
request_timeout_seconds: float = 20.0
mssql_host: str = "localhost"
mssql_port: int = 1433
mssql_username: str = "sa"
mssql_password: str = "Password!123"
mssql_driver: str = "ODBC Driver 18 for SQL Server"
mssql_trust_server_certificate: bool = False
wwi_database: str = "WorldWideImporters"
aw_database: str = "AdventureWorks2022DWH"
wwi_connection_string: str | None = None
aw_connection_string: str | None = None
postgres_host: str = "localhost"
postgres_port: int = 5432
postgres_database: str = "otel_bi_app"
postgres_username: str = "otel_bi_app"
postgres_password: str = "otel_bi_app"
postgres_sslmode: str = "require"
postgres_connection_string: str | None = None
postgres_required: bool = True
query_service_url: str = "http://localhost:8101"
analytics_service_url: str = "http://localhost:8102"
persistence_service_url: str = "http://localhost:8103"
require_frontend_auth: bool = True
frontend_jwt_issuer_url: str = ""
frontend_jwt_audience: str = ""
frontend_jwt_jwks_url: str | None = None
frontend_jwt_algorithm: str = "RS256"
frontend_required_scopes: str = ""
frontend_clock_skew_seconds: int = Field(default=30, ge=0, le=300)
internal_service_auth_enabled: bool = True
internal_service_shared_secret: str = "change-me"
internal_service_token_ttl_seconds: int = Field(default=120, ge=30, le=900)
internal_service_token_audience: str = "bi-internal"
internal_service_allowed_issuers: str = "api-gateway"
internal_token_clock_skew_seconds: int = Field(default=15, ge=0, le=120)
otel_service_name: str = "otel-bi-backend"
otel_service_namespace: str = "final-thesis"
otel_collector_endpoint: str = "http://localhost:4318"
otel_export_timeout_ms: int = 10000
forecast_horizon_days: int = Field(default=30, ge=7, le=180)
default_history_days: int = Field(default=365, ge=30, le=1460)
ranking_default_top_n: int = Field(default=10, ge=3, le=100)
storage_default_limit: int = Field(default=50, ge=10, le=500)
@property
def cors_origins_list(self) -> list[str]:
return [
origin.strip() for origin in self.cors_origins.split(",") if origin.strip()
]
@property
def frontend_required_scopes_list(self) -> list[str]:
return [
scope.strip()
for scope in self.frontend_required_scopes.split(" ")
if scope.strip()
]
@property
def internal_service_allowed_issuers_list(self) -> list[str]:
return [
issuer.strip()
for issuer in self.internal_service_allowed_issuers.split(",")
if issuer.strip()
]
def _build_mssql_connection_url(self, database: str) -> str:
driver = quote_plus(self.mssql_driver)
user = quote_plus(self.mssql_username)
password = quote_plus(self.mssql_password)
trust_cert = "yes" if self.mssql_trust_server_certificate else "no"
return (
f"mssql+pyodbc://{user}:{password}@{self.mssql_host}:{self.mssql_port}/{database}"
f"?driver={driver}&TrustServerCertificate={trust_cert}&ApplicationIntent=ReadOnly"
)
@property
def wwi_connection_url(self) -> str:
return self.wwi_connection_string or self._build_mssql_connection_url(
self.wwi_database
)
@property
def aw_connection_url(self) -> str:
return self.aw_connection_string or self._build_mssql_connection_url(
self.aw_database
)
@property
def postgres_connection_url(self) -> str:
if self.postgres_connection_string:
return self.postgres_connection_string
user = quote_plus(self.postgres_username)
password = quote_plus(self.postgres_password)
return (
f"postgresql+psycopg://{user}:{password}@{self.postgres_host}:{self.postgres_port}/"
f"{self.postgres_database}?sslmode={self.postgres_sslmode}"
)
@lru_cache
def get_settings() -> Settings:
return Settings()
settings = get_settings()
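
For reference, a minimal sketch of how these settings resolve (not part of this commit; the env values are illustrative and rely on pydantic-settings' default field-name-to-env-var mapping):

# Assuming MSSQL_HOST=sql.internal and CORS_ORIGINS="http://localhost:5173,https://bi.example.test"
# are set in the environment or .env, they override the defaults above.
from app.core.config import get_settings

settings = get_settings()           # process-wide singleton via lru_cache
print(settings.cors_origins_list)   # ['http://localhost:5173', 'https://bi.example.test']
print(settings.wwi_connection_url)  # mssql+pyodbc://sa:...@sql.internal:1433/WorldWideImporters?...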

backend/app/core/otel.py Normal file

@@ -0,0 +1,103 @@
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Any
from fastapi import FastAPI
from opentelemetry import metrics, trace
from opentelemetry.baggage.propagation import W3CBaggagePropagator
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.propagate import set_global_textmap
from opentelemetry.propagators.composite import CompositePropagator
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
try:
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
except ImportError: # pragma: no cover - defensive fallback for minimal envs
SystemMetricsInstrumentor = None # type: ignore[assignment]
from app.core.config import Settings
LOGGER = logging.getLogger(__name__)
@dataclass
class TelemetryProviders:
tracer_provider: TracerProvider
meter_provider: MeterProvider
def configure_otel(settings: Settings) -> TelemetryProviders:
set_global_textmap(
CompositePropagator([TraceContextTextMapPropagator(), W3CBaggagePropagator()])
)
resource = Resource.create(
{
"service.name": settings.otel_service_name,
"service.namespace": settings.otel_service_namespace,
"deployment.environment": settings.app_env,
}
)
trace_exporter = OTLPSpanExporter(
endpoint=f"{settings.otel_collector_endpoint}/v1/traces",
timeout=settings.otel_export_timeout_ms / 1000,
)
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(BatchSpanProcessor(trace_exporter))
trace.set_tracer_provider(tracer_provider)
metric_reader = PeriodicExportingMetricReader(
exporter=OTLPMetricExporter(
endpoint=f"{settings.otel_collector_endpoint}/v1/metrics",
timeout=settings.otel_export_timeout_ms / 1000,
),
export_interval_millis=10000,
)
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(meter_provider)
LoggingInstrumentor().instrument(set_logging_format=True)
if SystemMetricsInstrumentor is not None:
SystemMetricsInstrumentor().instrument()
else:
LOGGER.warning(
"System metrics instrumentor not available, runtime host metrics disabled."
)
LOGGER.info("OpenTelemetry providers configured")
return TelemetryProviders(
tracer_provider=tracer_provider, meter_provider=meter_provider
)
def instrument_fastapi(app: FastAPI) -> None:
FastAPIInstrumentor.instrument_app(app)
def instrument_sqlalchemy_engines(engines: dict[str, Any]) -> None:
    # SQLAlchemyInstrumentor is a singleton: calling instrument() once per
    # engine would make every call after the first a no-op, so all engines
    # are passed in a single call.
    SQLAlchemyInstrumentor().instrument(engines=list(engines.values()))
def instrument_httpx_clients() -> None:
HTTPXClientInstrumentor().instrument()
def shutdown_otel(providers: TelemetryProviders) -> None:
HTTPXClientInstrumentor().uninstrument()
if SystemMetricsInstrumentor is not None:
SystemMetricsInstrumentor().uninstrument()
LoggingInstrumentor().uninstrument()
providers.meter_provider.shutdown()
providers.tracer_provider.shutdown()
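
A sketch of how these helpers might be wired together (the commit's application entry point is not shown, so the lifespan wiring below is an assumption):

from contextlib import asynccontextmanager
from fastapi import FastAPI
from app.core.config import settings
from app.core.otel import (
    configure_otel,
    instrument_fastapi,
    instrument_httpx_clients,
    instrument_sqlalchemy_engines,
    shutdown_otel,
)
from app.db.engine import create_warehouse_engines, dispose_engines

@asynccontextmanager
async def lifespan(app: FastAPI):
    providers = configure_otel(settings)   # traces + metrics -> OTLP collector
    engines = create_warehouse_engines()
    instrument_sqlalchemy_engines(engines)
    instrument_httpx_clients()
    app.state.engines = engines
    yield
    dispose_engines(engines)
    shutdown_otel(providers)               # uninstrument and flush exporters

app = FastAPI(lifespan=lifespan)
instrument_fastapi(app)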


@@ -0,0 +1,231 @@
from __future__ import annotations
from dataclasses import dataclass
from functools import lru_cache
from time import time
from uuid import uuid4
import jwt
from fastapi import Depends, Header, HTTPException, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from jwt import InvalidTokenError, PyJWKClient
from app.core.config import settings
BEARER_SCHEME = HTTPBearer(auto_error=False)
@dataclass
class FrontendPrincipal:
subject: str
scopes: list[str]
claims: dict
token: str
@dataclass
class InternalPrincipal:
subject: str
scopes: list[str]
claims: dict
token: str
class FrontendJWTVerifier:
@property
def jwks_url(self) -> str:
if not settings.frontend_jwt_jwks_url:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="FRONTEND_JWT_JWKS_URL is not configured.",
)
return settings.frontend_jwt_jwks_url
@lru_cache(maxsize=1)
def _jwks_client(self) -> PyJWKClient:
return PyJWKClient(self.jwks_url)
@staticmethod
def _extract_scopes(claims: dict) -> list[str]:
scope = claims.get("scope")
if isinstance(scope, str):
return [item for item in scope.split(" ") if item]
scp = claims.get("scp")
if isinstance(scp, list):
return [str(item) for item in scp]
return []
def verify(self, token: str) -> FrontendPrincipal:
if not settings.frontend_jwt_issuer_url:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="FRONTEND_JWT_ISSUER_URL is not configured.",
)
if not settings.frontend_jwt_audience:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="FRONTEND_JWT_AUDIENCE is not configured.",
)
try:
signing_key = self._jwks_client().get_signing_key_from_jwt(token).key
claims = jwt.decode(
token,
key=signing_key,
algorithms=[settings.frontend_jwt_algorithm],
audience=settings.frontend_jwt_audience,
issuer=settings.frontend_jwt_issuer_url,
leeway=settings.frontend_clock_skew_seconds,
)
except InvalidTokenError as exc:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid frontend access token.",
) from exc
subject = str(claims.get("sub") or "")
if not subject:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Frontend token missing subject.",
)
scopes = self._extract_scopes(claims)
required = settings.frontend_required_scopes_list
missing = [scope for scope in required if scope not in scopes]
if missing:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"Missing required scope(s): {', '.join(missing)}",
)
return FrontendPrincipal(
subject=subject, scopes=scopes, claims=claims, token=token
)
class InternalTokenManager:
token_type = "internal-service"
@staticmethod
def _assert_secret() -> str:
secret = settings.internal_service_shared_secret
if not secret or secret == "change-me":
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="INTERNAL_SERVICE_SHARED_SECRET must be configured.",
)
if len(secret.encode("utf-8")) < 32:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=(
"INTERNAL_SERVICE_SHARED_SECRET must be at least 32 bytes for "
"HS256 token signing."
),
)
return secret
def mint(
self,
*,
subject: str,
scopes: list[str],
source_service: str,
) -> str:
now = int(time())
payload = {
"sub": subject,
"scope": " ".join(scopes),
"iss": source_service,
"aud": settings.internal_service_token_audience,
"typ": self.token_type,
"iat": now,
"nbf": now,
"exp": now + settings.internal_service_token_ttl_seconds,
"jti": str(uuid4()),
}
return jwt.encode(payload, self._assert_secret(), algorithm="HS256")
def verify(self, token: str) -> InternalPrincipal:
try:
claims = jwt.decode(
token,
self._assert_secret(),
algorithms=["HS256"],
audience=settings.internal_service_token_audience,
options={
"require": ["sub", "iss", "aud", "exp", "iat", "nbf", "jti", "typ"]
},
leeway=settings.internal_token_clock_skew_seconds,
)
except InvalidTokenError as exc:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid internal service token.",
) from exc
subject = str(claims.get("sub") or "")
if not subject:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Internal token missing subject.",
)
issuer = str(claims.get("iss") or "")
if issuer not in settings.internal_service_allowed_issuers_list:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Internal token issuer is not allowed.",
)
token_type = str(claims.get("typ") or "")
if token_type != self.token_type:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Internal token type is invalid.",
)
scope = claims.get("scope")
scopes = [item for item in str(scope).split(" ") if item] if scope else []
return InternalPrincipal(
subject=subject, scopes=scopes, claims=claims, token=token
)
@lru_cache(maxsize=1)
def get_frontend_verifier() -> FrontendJWTVerifier:
return FrontendJWTVerifier()
@lru_cache(maxsize=1)
def get_internal_token_manager() -> InternalTokenManager:
return InternalTokenManager()
def require_frontend_principal(
credentials: HTTPAuthorizationCredentials | None = Depends(BEARER_SCHEME),
) -> FrontendPrincipal:
if not settings.require_frontend_auth:
return FrontendPrincipal(subject="anonymous", scopes=[], claims={}, token="")
if credentials is None or credentials.scheme.lower() != "bearer":
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Missing bearer token.",
)
return get_frontend_verifier().verify(credentials.credentials)
def require_internal_principal(
internal_token: str | None = Header(default=None, alias="x-internal-service-token"),
) -> InternalPrincipal:
if not settings.internal_service_auth_enabled:
return InternalPrincipal(
subject="internal-unauth", scopes=[], claims={}, token=""
)
if not internal_token:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Missing x-internal-service-token header.",
)
return get_internal_token_manager().verify(internal_token)
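
A minimal sketch of the internal-token round trip (illustrative only; this file's path is not shown in the diff, so the app.core.auth module path is an assumption):

from app.core.auth import get_internal_token_manager  # hypothetical module path

manager = get_internal_token_manager()
token = manager.mint(
    subject="user-123",
    scopes=["analytics:read"],
    source_service="api-gateway",  # must appear in INTERNAL_SERVICE_ALLOWED_ISSUERS
)
principal = manager.verify(token)  # raises HTTPException(401) on expiry or tampering
print(principal.subject, principal.scopes)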

backend/app/db/__init__.py Normal file

@@ -0,0 +1 @@
"""Database helpers for warehouse connections."""

backend/app/db/engine.py Normal file

@@ -0,0 +1,34 @@
from __future__ import annotations
from sqlalchemy import create_engine, event
from sqlalchemy.engine import Engine
from app.core.config import settings
def _create_read_only_engine(connection_url: str) -> Engine:
engine = create_engine(
connection_url, pool_pre_ping=True, pool_recycle=3600, future=True
)
    @event.listens_for(engine, "connect")
    def _on_connect(dbapi_connection, _connection_record) -> None:
        # Pin a predictable isolation level for analytics reads; the URL also
        # sets ApplicationIntent=ReadOnly, and ReadOnlyWarehouseClient rejects
        # non-SELECT statements before execution.
        cursor = dbapi_connection.cursor()
        try:
            cursor.execute("SET TRANSACTION ISOLATION LEVEL READ COMMITTED;")
        finally:
            cursor.close()
return engine
def create_warehouse_engines() -> dict[str, Engine]:
return {
"wwi": _create_read_only_engine(settings.wwi_connection_url),
"aw": _create_read_only_engine(settings.aw_connection_url),
}
def dispose_engines(engines: dict[str, Engine]) -> None:
for engine in engines.values():
engine.dispose()
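
Usage sketch (the smoke-test query is illustrative, not part of this commit):

from sqlalchemy import text
from app.db.engine import create_warehouse_engines, dispose_engines

engines = create_warehouse_engines()  # {"wwi": Engine, "aw": Engine}
try:
    with engines["aw"].connect() as conn:
        print(conn.execute(text("SELECT 1")).scalar())  # connectivity check
finally:
    dispose_engines(engines)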


@@ -0,0 +1,27 @@
from __future__ import annotations
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session, sessionmaker
from app.core.config import settings
from app.db.postgres_models import Base
def create_postgres_engine() -> Engine:
return create_engine(
settings.postgres_connection_url,
pool_pre_ping=True,
pool_recycle=3600,
future=True,
)
def initialize_postgres_schema(engine: Engine) -> None:
Base.metadata.create_all(bind=engine)
def create_postgres_session_factory(engine: Engine) -> sessionmaker[Session]:
return sessionmaker(
bind=engine, autoflush=False, autocommit=False, expire_on_commit=False
)
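
Usage sketch (this file's path is not shown in the diff, so app.db.postgres below is an assumption):

from app.db.postgres import (  # hypothetical module path
    create_postgres_engine,
    create_postgres_session_factory,
    initialize_postgres_schema,
)

engine = create_postgres_engine()
initialize_postgres_schema(engine)  # create_all() is a no-op for existing tables
SessionLocal = create_postgres_session_factory(engine)
with SessionLocal() as session:
    ...  # read/write the app persistence tables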

backend/app/db/postgres_models.py Normal file

@@ -0,0 +1,86 @@
from __future__ import annotations
from datetime import datetime, timezone
from uuid import uuid4
from sqlalchemy import JSON, DateTime, Float, Integer, String, Text
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
def _utcnow() -> datetime:
return datetime.now(timezone.utc)
class Base(DeclarativeBase):
pass
class AuditLog(Base):
__tablename__ = "audit_logs"
id: Mapped[str] = mapped_column(
String(36), primary_key=True, default=lambda: str(uuid4())
)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=_utcnow, index=True
)
method: Mapped[str] = mapped_column(String(12), index=True)
path: Mapped[str] = mapped_column(String(300), index=True)
query_string: Mapped[str] = mapped_column(String(1000), default="")
status_code: Mapped[int] = mapped_column(Integer, index=True)
duration_ms: Mapped[float] = mapped_column(Float)
trace_id: Mapped[str | None] = mapped_column(String(32), nullable=True, index=True)
span_id: Mapped[str | None] = mapped_column(String(16), nullable=True, index=True)
client_ip: Mapped[str | None] = mapped_column(String(120), nullable=True)
user_agent: Mapped[str | None] = mapped_column(Text, nullable=True)
details: Mapped[dict] = mapped_column(JSON, default=dict)
class ForecastRun(Base):
__tablename__ = "forecast_runs"
id: Mapped[str] = mapped_column(
String(36), primary_key=True, default=lambda: str(uuid4())
)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=_utcnow, index=True
)
horizon_days: Mapped[int] = mapped_column(Integer)
point_count: Mapped[int] = mapped_column(Integer)
trigger_source: Mapped[str] = mapped_column(String(64), index=True)
trace_id: Mapped[str | None] = mapped_column(String(32), nullable=True, index=True)
span_id: Mapped[str | None] = mapped_column(String(16), nullable=True, index=True)
payload: Mapped[list[dict]] = mapped_column(JSON, default=list)
class RankingRun(Base):
__tablename__ = "ranking_runs"
id: Mapped[str] = mapped_column(
String(36), primary_key=True, default=lambda: str(uuid4())
)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=_utcnow, index=True
)
top_n: Mapped[int] = mapped_column(Integer)
item_count: Mapped[int] = mapped_column(Integer)
trigger_source: Mapped[str] = mapped_column(String(64), index=True)
trace_id: Mapped[str | None] = mapped_column(String(32), nullable=True, index=True)
span_id: Mapped[str | None] = mapped_column(String(16), nullable=True, index=True)
payload: Mapped[list[dict]] = mapped_column(JSON, default=list)
class RecommendationRun(Base):
__tablename__ = "recommendation_runs"
id: Mapped[str] = mapped_column(
String(36), primary_key=True, default=lambda: str(uuid4())
)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=_utcnow, index=True
)
item_count: Mapped[int] = mapped_column(Integer)
trigger_source: Mapped[str] = mapped_column(String(64), index=True)
trace_id: Mapped[str | None] = mapped_column(String(32), nullable=True, index=True)
span_id: Mapped[str | None] = mapped_column(String(16), nullable=True, index=True)
payload: Mapped[list[dict]] = mapped_column(JSON, default=list)

backend/app/db/queries.py Normal file

@@ -0,0 +1,167 @@
from __future__ import annotations
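# Each constant below is a list of SQL candidates in priority order: the first
# statement targets the expected schema, and the later ones are simplified
# fallbacks that ReadOnlyWarehouseClient tries when an earlier candidate fails.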
AW_DAILY_SALES_QUERIES = [
"""
SELECT
CAST(d.FullDateAlternateKey AS date) AS sale_date,
SUM(f.SalesAmount) AS revenue,
SUM(f.TotalProductCost) AS cost,
SUM(f.OrderQuantity) AS quantity,
COUNT_BIG(*) AS orders
FROM dbo.FactInternetSales AS f
INNER JOIN dbo.DimDate AS d ON d.DateKey = f.OrderDateKey
GROUP BY CAST(d.FullDateAlternateKey AS date)
ORDER BY sale_date;
""",
"""
SELECT
CAST(OrderDate AS date) AS sale_date,
SUM(SalesAmount) AS revenue,
SUM(TotalProductCost) AS cost,
SUM(OrderQuantity) AS quantity,
COUNT_BIG(*) AS orders
FROM dbo.FactInternetSales
GROUP BY CAST(OrderDate AS date)
ORDER BY sale_date;
""",
]
WWI_DAILY_SALES_QUERIES = [
"""
SELECT
CAST(i.InvoiceDate AS date) AS sale_date,
SUM(il.ExtendedPrice) AS revenue,
SUM(il.TaxAmount) AS cost,
SUM(il.Quantity) AS quantity,
COUNT_BIG(DISTINCT i.InvoiceID) AS orders
FROM Sales.Invoices AS i
INNER JOIN Sales.InvoiceLines AS il ON il.InvoiceID = i.InvoiceID
GROUP BY CAST(i.InvoiceDate AS date)
ORDER BY sale_date;
""",
"""
SELECT
CAST(i.InvoiceDate AS date) AS sale_date,
SUM(il.UnitPrice * il.Quantity) AS revenue,
CAST(0 AS float) AS cost,
SUM(il.Quantity) AS quantity,
COUNT_BIG(DISTINCT i.InvoiceID) AS orders
FROM Sales.Invoices AS i
INNER JOIN Sales.InvoiceLines AS il ON il.InvoiceID = i.InvoiceID
GROUP BY CAST(i.InvoiceDate AS date)
ORDER BY sale_date;
""",
]
AW_PRODUCT_PERFORMANCE_QUERIES = [
"""
SELECT
p.ProductAlternateKey AS product_id,
p.EnglishProductName AS product_name,
COALESCE(sc.EnglishProductSubcategoryName, 'Unknown') AS category_name,
SUM(f.SalesAmount) AS revenue,
SUM(f.TotalProductCost) AS cost,
SUM(f.OrderQuantity) AS quantity,
COUNT_BIG(*) AS orders
FROM dbo.FactInternetSales AS f
INNER JOIN dbo.DimProduct AS p ON p.ProductKey = f.ProductKey
LEFT JOIN dbo.DimProductSubcategory AS sc ON sc.ProductSubcategoryKey = p.ProductSubcategoryKey
GROUP BY p.ProductAlternateKey, p.EnglishProductName, sc.EnglishProductSubcategoryName
ORDER BY revenue DESC;
""",
"""
SELECT
CAST(ProductKey AS nvarchar(100)) AS product_id,
CAST(ProductKey AS nvarchar(100)) AS product_name,
'Unknown' AS category_name,
SUM(SalesAmount) AS revenue,
SUM(TotalProductCost) AS cost,
SUM(OrderQuantity) AS quantity,
COUNT_BIG(*) AS orders
FROM dbo.FactInternetSales
GROUP BY ProductKey
ORDER BY revenue DESC;
""",
]
WWI_PRODUCT_PERFORMANCE_QUERIES = [
"""
SELECT
CAST(s.StockItemID AS nvarchar(100)) AS product_id,
s.StockItemName AS product_name,
COALESCE(cg.StockGroupName, 'Unknown') AS category_name,
SUM(il.ExtendedPrice) AS revenue,
SUM(il.TaxAmount) AS cost,
SUM(il.Quantity) AS quantity,
COUNT_BIG(*) AS orders
FROM Sales.InvoiceLines AS il
INNER JOIN Warehouse.StockItems AS s ON s.StockItemID = il.StockItemID
LEFT JOIN Warehouse.StockItemStockGroups AS sig ON sig.StockItemID = s.StockItemID
LEFT JOIN Warehouse.StockGroups AS cg ON cg.StockGroupID = sig.StockGroupID
GROUP BY s.StockItemID, s.StockItemName, cg.StockGroupName
ORDER BY revenue DESC;
""",
"""
SELECT
CAST(il.StockItemID AS nvarchar(100)) AS product_id,
CAST(il.StockItemID AS nvarchar(100)) AS product_name,
'Unknown' AS category_name,
SUM(il.UnitPrice * il.Quantity) AS revenue,
CAST(0 AS float) AS cost,
SUM(il.Quantity) AS quantity,
COUNT_BIG(*) AS orders
FROM Sales.InvoiceLines AS il
GROUP BY il.StockItemID
ORDER BY revenue DESC;
""",
]
AW_CUSTOMER_QUERIES = [
"""
SELECT
CAST(c.CustomerAlternateKey AS nvarchar(100)) AS customer_id,
c.FirstName + ' ' + c.LastName AS customer_name,
SUM(f.SalesAmount) AS revenue,
COUNT_BIG(*) AS orders
FROM dbo.FactInternetSales AS f
INNER JOIN dbo.DimCustomer AS c ON c.CustomerKey = f.CustomerKey
GROUP BY c.CustomerAlternateKey, c.FirstName, c.LastName
ORDER BY revenue DESC;
""",
"""
SELECT
CAST(CustomerKey AS nvarchar(100)) AS customer_id,
CAST(CustomerKey AS nvarchar(100)) AS customer_name,
SUM(SalesAmount) AS revenue,
COUNT_BIG(*) AS orders
FROM dbo.FactInternetSales
GROUP BY CustomerKey
ORDER BY revenue DESC;
""",
]
WWI_CUSTOMER_QUERIES = [
"""
SELECT
CAST(c.CustomerID AS nvarchar(100)) AS customer_id,
c.CustomerName AS customer_name,
SUM(il.ExtendedPrice) AS revenue,
COUNT_BIG(DISTINCT i.InvoiceID) AS orders
FROM Sales.Invoices AS i
INNER JOIN Sales.InvoiceLines AS il ON il.InvoiceID = i.InvoiceID
INNER JOIN Sales.Customers AS c ON c.CustomerID = i.CustomerID
GROUP BY c.CustomerID, c.CustomerName
ORDER BY revenue DESC;
""",
"""
SELECT
CAST(i.CustomerID AS nvarchar(100)) AS customer_id,
CAST(i.CustomerID AS nvarchar(100)) AS customer_name,
SUM(il.UnitPrice * il.Quantity) AS revenue,
COUNT_BIG(DISTINCT i.InvoiceID) AS orders
FROM Sales.Invoices AS i
INNER JOIN Sales.InvoiceLines AS il ON il.InvoiceID = i.InvoiceID
GROUP BY i.CustomerID
ORDER BY revenue DESC;
""",
]

backend/app/services/__init__.py Normal file

@@ -0,0 +1 @@
"""Business logic services."""

backend/app/services/analytics_service.py Normal file

@@ -0,0 +1,373 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import date, timedelta
from math import sqrt
import numpy as np
import pandas as pd
from opentelemetry import trace
from sklearn.linear_model import LinearRegression
from app.core.config import settings
from app.services.persistence_service import PersistenceService
from app.services.warehouse_service import ReadOnlyWarehouseClient
@dataclass
class DashboardSnapshot:
kpis: dict
history: list[dict]
forecasts: list[dict]
rankings: list[dict]
recommendations: list[dict]
class AnalyticsService:
def __init__(
self,
warehouse_client: ReadOnlyWarehouseClient,
persistence_service: PersistenceService | None = None,
) -> None:
self.warehouse_client = warehouse_client
self.persistence_service = persistence_service
self.tracer = trace.get_tracer(__name__)
@staticmethod
def _normalize_frame(df: pd.DataFrame, date_col: str = "sale_date") -> pd.DataFrame:
normalized = df.copy()
normalized[date_col] = pd.to_datetime(normalized[date_col], errors="coerce")
for numeric in ("revenue", "cost", "quantity", "orders"):
if numeric in normalized.columns:
normalized[numeric] = pd.to_numeric(
normalized[numeric], errors="coerce"
).fillna(0.0)
return normalized.dropna(subset=[date_col])
def load_sales_history(self, days_back: int | None = None) -> pd.DataFrame:
with self.tracer.start_as_current_span("analytics.load_sales_history"):
daily_sales = self._normalize_frame(
self.warehouse_client.fetch_daily_sales()
)
days = days_back or settings.default_history_days
min_date = pd.Timestamp(date.today() - timedelta(days=days))
filtered = daily_sales[daily_sales["sale_date"] >= min_date]
return (
filtered.groupby("sale_date", as_index=False)[
["revenue", "cost", "quantity", "orders"]
]
.sum()
.sort_values("sale_date")
)
def get_kpis(self) -> dict:
with self.tracer.start_as_current_span("analytics.kpis"):
sales = self.load_sales_history(days_back=180)
if sales.empty:
return {
"total_revenue": 0.0,
"gross_margin_pct": 0.0,
"total_quantity": 0.0,
"avg_order_value": 0.0,
"records_in_window": 0,
}
total_revenue = float(sales["revenue"].sum())
total_cost = float(sales["cost"].sum())
total_orders = max(float(sales["orders"].sum()), 1.0)
margin_pct = (
((total_revenue - total_cost) / total_revenue * 100)
if total_revenue
else 0.0
)
return {
"total_revenue": round(total_revenue, 2),
"gross_margin_pct": round(margin_pct, 2),
"total_quantity": round(float(sales["quantity"].sum()), 2),
"avg_order_value": round(total_revenue / total_orders, 2),
"records_in_window": int(sales.shape[0]),
}
def get_history_points(self, days_back: int | None = None) -> list[dict]:
with self.tracer.start_as_current_span("analytics.history_points"):
sales = self.load_sales_history(days_back=days_back)
if sales.empty:
return []
return [
{
"date": pd.Timestamp(row["sale_date"]).date().isoformat(),
"revenue": round(float(row["revenue"]), 2),
"cost": round(float(row["cost"]), 2),
"quantity": round(float(row["quantity"]), 2),
}
for _, row in sales.iterrows()
]
def get_forecast(
self,
horizon_days: int | None = None,
*,
trigger_source: str = "api.forecasts",
persist: bool = True,
) -> list[dict]:
with self.tracer.start_as_current_span("analytics.forecast"):
horizon = horizon_days or settings.forecast_horizon_days
sales = self.load_sales_history(days_back=720)
if sales.empty:
return []
series = (
sales.set_index("sale_date")["revenue"]
.sort_index()
.resample("D")
.sum()
.fillna(0.0)
)
y = series.values
x = np.arange(len(y), dtype=float).reshape(-1, 1)
model = LinearRegression()
model.fit(x, y)
baseline = model.predict(x)
residual = y - baseline
sigma = float(np.std(residual)) if len(residual) > 1 else 0.0
weekday_baseline = series.groupby(series.index.weekday).mean()
overall_mean = float(series.mean()) if len(series) else 0.0
weekday_factor = (
weekday_baseline / overall_mean
if overall_mean > 0
else pd.Series([1.0] * 7, index=range(7))
)
weekday_factor = weekday_factor.replace([np.inf, -np.inf], 1.0).fillna(1.0)
future_x = np.arange(len(y), len(y) + horizon, dtype=float).reshape(-1, 1)
raw_forecast = model.predict(future_x)
predictions: list[dict] = []
start_date = series.index.max().date()
for idx, point in enumerate(raw_forecast, start=1):
day = start_date + timedelta(days=idx)
factor = (
float(weekday_factor.loc[day.weekday()])
if day.weekday() in weekday_factor.index
else 1.0
)
yhat = max(float(point) * factor, 0.0)
ci = 1.96 * sigma * sqrt(1 + idx / max(len(y), 1))
predictions.append(
{
"date": day.isoformat(),
"predicted_revenue": round(yhat, 2),
"lower_bound": round(max(yhat - ci, 0.0), 2),
"upper_bound": round(yhat + ci, 2),
}
)
if persist and self.persistence_service is not None:
span_context = trace.get_current_span().get_span_context()
trace_id = (
f"{span_context.trace_id:032x}" if span_context.is_valid else None
)
span_id = (
f"{span_context.span_id:016x}" if span_context.is_valid else None
)
self.persistence_service.record_forecast_run(
horizon_days=horizon,
payload=predictions,
trigger_source=trigger_source,
trace_id=trace_id,
span_id=span_id,
)
return predictions
def get_rankings(
self,
top_n: int | None = None,
*,
trigger_source: str = "api.rankings",
persist: bool = True,
) -> list[dict]:
with self.tracer.start_as_current_span("analytics.rankings"):
n = top_n or settings.ranking_default_top_n
products = self.warehouse_client.fetch_product_performance().copy()
if products.empty:
return []
products["revenue"] = pd.to_numeric(
products["revenue"], errors="coerce"
).fillna(0.0)
products["cost"] = pd.to_numeric(products["cost"], errors="coerce").fillna(
0.0
)
products["quantity"] = pd.to_numeric(
products["quantity"], errors="coerce"
).fillna(0.0)
products["orders"] = pd.to_numeric(
products["orders"], errors="coerce"
).fillna(0.0)
grouped = (
products.groupby(
["product_id", "product_name", "category_name"], as_index=False
)[["revenue", "cost", "quantity", "orders"]]
.sum()
.sort_values("revenue", ascending=False)
)
grouped["margin_pct"] = np.where(
grouped["revenue"] > 0,
((grouped["revenue"] - grouped["cost"]) / grouped["revenue"]) * 100,
0.0,
)
revenue_norm = grouped["revenue"] / max(
float(grouped["revenue"].max()), 1.0
)
margin_norm = (grouped["margin_pct"] + 100) / 200
velocity_norm = grouped["quantity"] / max(
float(grouped["quantity"].max()), 1.0
)
grouped["score"] = (
(0.55 * revenue_norm)
+ (0.30 * margin_norm.clip(0, 1))
+ (0.15 * velocity_norm)
)
ranked = (
grouped.sort_values("score", ascending=False)
.head(n)
.reset_index(drop=True)
)
result = [
{
"rank": int(idx + 1),
"product_id": str(row["product_id"]),
"product_name": str(row["product_name"]),
"category": str(row["category_name"]),
"revenue": round(float(row["revenue"]), 2),
"margin_pct": round(float(row["margin_pct"]), 2),
"score": round(float(row["score"]) * 100, 2),
}
for idx, row in ranked.iterrows()
]
if persist and self.persistence_service is not None:
span_context = trace.get_current_span().get_span_context()
trace_id = (
f"{span_context.trace_id:032x}" if span_context.is_valid else None
)
span_id = (
f"{span_context.span_id:016x}" if span_context.is_valid else None
)
self.persistence_service.record_ranking_run(
top_n=n,
payload=result,
trigger_source=trigger_source,
trace_id=trace_id,
span_id=span_id,
)
return result
def get_recommendations(
self,
rankings: list[dict] | None = None,
*,
trigger_source: str = "api.recommendations",
persist: bool = True,
) -> list[dict]:
with self.tracer.start_as_current_span("analytics.recommendations"):
ranking_rows = (
rankings
if rankings is not None
else self.get_rankings(
top_n=20, trigger_source=trigger_source, persist=persist
)
)
customers = self.warehouse_client.fetch_customer_performance().copy()
if customers.empty:
customers = pd.DataFrame(columns=["customer_name", "revenue", "orders"])
recommendations: list[dict] = []
if ranking_rows:
champion = ranking_rows[0]
recommendations.append(
{
"title": "Double down on champion SKU",
"priority": "high",
"summary": (
f"Promote '{champion['product_name']}' with score {champion['score']:.2f} "
f"and margin {champion['margin_pct']:.2f}%."
),
}
)
low_margin = next(
(row for row in ranking_rows if row["margin_pct"] < 10), None
)
if low_margin:
recommendations.append(
{
"title": "Review pricing for low-margin bestseller",
"priority": "medium",
"summary": (
f"'{low_margin['product_name']}' has strong rank but only "
f"{low_margin['margin_pct']:.2f}% margin."
),
}
)
if not customers.empty:
customers["revenue"] = pd.to_numeric(
customers["revenue"], errors="coerce"
).fillna(0.0)
customers["orders"] = pd.to_numeric(
customers["orders"], errors="coerce"
).fillna(0.0)
customer = customers.sort_values("revenue", ascending=False).iloc[0]
recommendations.append(
{
"title": "Protect top customer relationship",
"priority": "high",
"summary": (
f"Prioritize retention for '{customer['customer_name']}' with "
f"{float(customer['orders']):.0f} orders and {float(customer['revenue']):.2f} revenue."
),
}
)
result = recommendations[:5]
if persist and self.persistence_service is not None:
span_context = trace.get_current_span().get_span_context()
trace_id = (
f"{span_context.trace_id:032x}" if span_context.is_valid else None
)
span_id = (
f"{span_context.span_id:016x}" if span_context.is_valid else None
)
self.persistence_service.record_recommendation_run(
payload=result,
trigger_source=trigger_source,
trace_id=trace_id,
span_id=span_id,
)
return result
def get_dashboard(self) -> DashboardSnapshot:
with self.tracer.start_as_current_span("analytics.dashboard"):
rankings = self.get_rankings(trigger_source="api.dashboard", persist=True)
return DashboardSnapshot(
kpis=self.get_kpis(),
history=self.get_history_points(),
forecasts=self.get_forecast(
trigger_source="api.dashboard", persist=True
),
rankings=rankings,
recommendations=self.get_recommendations(
rankings=rankings,
trigger_source="api.dashboard",
persist=True,
),
)
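
A sketch of standalone wiring (outside FastAPI dependency injection; the analytics module path is inferred from the sibling imports, not confirmed by this diff; with persistence_service omitted, runs are computed but never written to PostgreSQL):

from app.db.engine import create_warehouse_engines
from app.services.analytics_service import AnalyticsService
from app.services.warehouse_service import ReadOnlyWarehouseClient

service = AnalyticsService(ReadOnlyWarehouseClient(create_warehouse_engines()))
snapshot = service.get_dashboard()
print(snapshot.kpis["total_revenue"], len(snapshot.forecasts), len(snapshot.rankings))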

backend/app/services/persistence_service.py Normal file

@@ -0,0 +1,281 @@
from __future__ import annotations
import logging
from time import perf_counter
from opentelemetry import metrics, trace
from sqlalchemy import desc, select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import Session, sessionmaker
from app.db.postgres_models import AuditLog, ForecastRun, RankingRun, RecommendationRun
LOGGER = logging.getLogger(__name__)
class PersistenceService:
def __init__(self, session_factory: sessionmaker[Session]) -> None:
self.session_factory = session_factory
self.tracer = trace.get_tracer(__name__)
self.meter = metrics.get_meter(__name__)
self.write_counter = self.meter.create_counter(
name="postgres_persist_writes_total",
description="Total writes to app persistence PostgreSQL",
)
self.write_latency = self.meter.create_histogram(
name="postgres_persist_write_latency_ms",
unit="ms",
description="Latency of app persistence write operations",
)
@staticmethod
def _to_audit_dict(row: AuditLog) -> dict:
return {
"id": row.id,
"created_at": row.created_at.isoformat(),
"method": row.method,
"path": row.path,
"query_string": row.query_string,
"status_code": row.status_code,
"duration_ms": row.duration_ms,
"trace_id": row.trace_id,
"span_id": row.span_id,
"client_ip": row.client_ip,
"user_agent": row.user_agent,
"details": row.details,
}
@staticmethod
def _to_forecast_dict(row: ForecastRun) -> dict:
return {
"id": row.id,
"created_at": row.created_at.isoformat(),
"horizon_days": row.horizon_days,
"point_count": row.point_count,
"trigger_source": row.trigger_source,
"trace_id": row.trace_id,
"span_id": row.span_id,
"payload": row.payload,
}
@staticmethod
def _to_ranking_dict(row: RankingRun) -> dict:
return {
"id": row.id,
"created_at": row.created_at.isoformat(),
"top_n": row.top_n,
"item_count": row.item_count,
"trigger_source": row.trigger_source,
"trace_id": row.trace_id,
"span_id": row.span_id,
"payload": row.payload,
}
@staticmethod
def _to_recommendation_dict(row: RecommendationRun) -> dict:
return {
"id": row.id,
"created_at": row.created_at.isoformat(),
"item_count": row.item_count,
"trigger_source": row.trigger_source,
"trace_id": row.trace_id,
"span_id": row.span_id,
"payload": row.payload,
}
def record_audit_log(
self,
*,
method: str,
path: str,
query_string: str,
status_code: int,
duration_ms: float,
trace_id: str | None,
span_id: str | None,
client_ip: str | None,
user_agent: str | None,
details: dict | None = None,
) -> None:
started = perf_counter()
with self.tracer.start_as_current_span("persist.audit_log"):
try:
with self.session_factory() as session:
session.add(
AuditLog(
method=method,
path=path,
query_string=query_string[:1000],
status_code=status_code,
duration_ms=duration_ms,
trace_id=trace_id,
span_id=span_id,
client_ip=client_ip,
user_agent=user_agent,
details=details or {},
)
)
session.commit()
self.write_counter.add(
1, attributes={"entity": "audit", "status": "ok"}
)
except SQLAlchemyError as exc:
LOGGER.exception("Failed to persist audit log: %s", exc)
self.write_counter.add(
1, attributes={"entity": "audit", "status": "error"}
)
finally:
self.write_latency.record(
(perf_counter() - started) * 1000,
attributes={"entity": "audit"},
)
def record_forecast_run(
self,
*,
horizon_days: int,
payload: list[dict],
trigger_source: str,
trace_id: str | None,
span_id: str | None,
) -> None:
started = perf_counter()
with self.tracer.start_as_current_span("persist.forecast_run"):
try:
with self.session_factory() as session:
session.add(
ForecastRun(
horizon_days=horizon_days,
point_count=len(payload),
trigger_source=trigger_source,
trace_id=trace_id,
span_id=span_id,
payload=payload,
)
)
session.commit()
self.write_counter.add(
1, attributes={"entity": "forecast", "status": "ok"}
)
except SQLAlchemyError as exc:
LOGGER.exception("Failed to persist forecast run: %s", exc)
self.write_counter.add(
1, attributes={"entity": "forecast", "status": "error"}
)
finally:
self.write_latency.record(
(perf_counter() - started) * 1000,
attributes={"entity": "forecast"},
)
def record_ranking_run(
self,
*,
top_n: int,
payload: list[dict],
trigger_source: str,
trace_id: str | None,
span_id: str | None,
) -> None:
started = perf_counter()
with self.tracer.start_as_current_span("persist.ranking_run"):
try:
with self.session_factory() as session:
session.add(
RankingRun(
top_n=top_n,
item_count=len(payload),
trigger_source=trigger_source,
trace_id=trace_id,
span_id=span_id,
payload=payload,
)
)
session.commit()
self.write_counter.add(
1, attributes={"entity": "ranking", "status": "ok"}
)
except SQLAlchemyError as exc:
LOGGER.exception("Failed to persist ranking run: %s", exc)
self.write_counter.add(
1, attributes={"entity": "ranking", "status": "error"}
)
finally:
self.write_latency.record(
(perf_counter() - started) * 1000,
attributes={"entity": "ranking"},
)
def record_recommendation_run(
self,
*,
payload: list[dict],
trigger_source: str,
trace_id: str | None,
span_id: str | None,
) -> None:
started = perf_counter()
with self.tracer.start_as_current_span("persist.recommendation_run"):
try:
with self.session_factory() as session:
session.add(
RecommendationRun(
item_count=len(payload),
trigger_source=trigger_source,
trace_id=trace_id,
span_id=span_id,
payload=payload,
)
)
session.commit()
self.write_counter.add(
1, attributes={"entity": "recommendation", "status": "ok"}
)
except SQLAlchemyError as exc:
LOGGER.exception("Failed to persist recommendation run: %s", exc)
self.write_counter.add(
1, attributes={"entity": "recommendation", "status": "error"}
)
finally:
self.write_latency.record(
(perf_counter() - started) * 1000,
attributes={"entity": "recommendation"},
)
def list_audit_logs(self, limit: int) -> list[dict]:
with self.tracer.start_as_current_span("persist.list_audit_logs"):
with self.session_factory() as session:
rows = session.execute(
select(AuditLog).order_by(desc(AuditLog.created_at)).limit(limit)
).scalars()
return [self._to_audit_dict(row) for row in rows]
def list_forecast_runs(self, limit: int) -> list[dict]:
with self.tracer.start_as_current_span("persist.list_forecast_runs"):
with self.session_factory() as session:
rows = session.execute(
select(ForecastRun)
.order_by(desc(ForecastRun.created_at))
.limit(limit)
).scalars()
return [self._to_forecast_dict(row) for row in rows]
def list_ranking_runs(self, limit: int) -> list[dict]:
with self.tracer.start_as_current_span("persist.list_ranking_runs"):
with self.session_factory() as session:
rows = session.execute(
select(RankingRun)
.order_by(desc(RankingRun.created_at))
.limit(limit)
).scalars()
return [self._to_ranking_dict(row) for row in rows]
def list_recommendation_runs(self, limit: int) -> list[dict]:
with self.tracer.start_as_current_span("persist.list_recommendation_runs"):
with self.session_factory() as session:
rows = session.execute(
select(RecommendationRun)
.order_by(desc(RecommendationRun.created_at))
.limit(limit)
).scalars()
return [self._to_recommendation_dict(row) for row in rows]
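
Usage sketch (payload values are illustrative; the app.db.postgres helper path is the same assumption as above):

from app.db.postgres import create_postgres_engine, create_postgres_session_factory  # hypothetical path
from app.services.persistence_service import PersistenceService

persistence = PersistenceService(create_postgres_session_factory(create_postgres_engine()))
persistence.record_forecast_run(
    horizon_days=30,
    payload=[{"date": "2026-03-21", "predicted_revenue": 1234.56}],
    trigger_source="manual.backfill",
    trace_id=None,
    span_id=None,
)
print(persistence.list_forecast_runs(limit=5))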

backend/app/services/warehouse_service.py Normal file

@@ -0,0 +1,101 @@
from __future__ import annotations
import hashlib
import logging
from collections.abc import Sequence
from time import perf_counter
import pandas as pd
from opentelemetry import metrics, trace
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
from app.db import queries
LOGGER = logging.getLogger(__name__)
class ReadOnlyWarehouseClient:
def __init__(self, engines: dict[str, Engine]) -> None:
self.engines = engines
self.tracer = trace.get_tracer(__name__)
self.meter = metrics.get_meter(__name__)
self.query_counter = self.meter.create_counter(
name="warehouse_queries_total",
description="Total warehouse query executions",
)
self.query_latency = self.meter.create_histogram(
name="warehouse_query_latency_ms",
unit="ms",
description="Warehouse query latency",
)
def _validate_read_only_query(self, sql: str) -> None:
normalized = sql.strip().lower()
if not (normalized.startswith("select") or normalized.startswith("with")):
raise ValueError("Only read-only SELECT/CTE SQL statements are allowed.")
def _run_query_list(
self, source: str, sql_candidates: Sequence[str]
) -> pd.DataFrame:
engine = self.engines[source]
last_error: Exception | None = None
for candidate in sql_candidates:
self._validate_read_only_query(candidate)
query_hash = hashlib.sha256(candidate.encode("utf-8")).hexdigest()[:12]
with self.tracer.start_as_current_span("warehouse.query") as span:
span.set_attribute("db.system", "mssql")
span.set_attribute("db.source", source)
span.set_attribute("db.query.hash", query_hash)
started = perf_counter()
try:
with engine.connect() as conn:
with self.tracer.start_as_current_span(
"warehouse.query.execute"
):
df = pd.read_sql_query(sql=text(candidate), con=conn)
elapsed_ms = (perf_counter() - started) * 1000
self.query_latency.record(elapsed_ms, attributes={"source": source})
self.query_counter.add(
1, attributes={"source": source, "status": "ok"}
)
return df
except SQLAlchemyError as exc:
last_error = exc
elapsed_ms = (perf_counter() - started) * 1000
self.query_latency.record(elapsed_ms, attributes={"source": source})
self.query_counter.add(
1, attributes={"source": source, "status": "error"}
)
LOGGER.warning(
"Query failed for %s with hash %s: %s", source, query_hash, exc
)
if last_error is not None:
raise RuntimeError(
f"All query candidates failed for source '{source}'."
) from last_error
return pd.DataFrame()
def fetch_daily_sales(self) -> pd.DataFrame:
aw = self._run_query_list("aw", queries.AW_DAILY_SALES_QUERIES)
aw["source"] = "AdventureWorks2022DWH"
wwi = self._run_query_list("wwi", queries.WWI_DAILY_SALES_QUERIES)
wwi["source"] = "WorldWideImporters"
return pd.concat([aw, wwi], ignore_index=True)
def fetch_product_performance(self) -> pd.DataFrame:
aw = self._run_query_list("aw", queries.AW_PRODUCT_PERFORMANCE_QUERIES)
aw["source"] = "AdventureWorks2022DWH"
wwi = self._run_query_list("wwi", queries.WWI_PRODUCT_PERFORMANCE_QUERIES)
wwi["source"] = "WorldWideImporters"
return pd.concat([aw, wwi], ignore_index=True)
def fetch_customer_performance(self) -> pd.DataFrame:
aw = self._run_query_list("aw", queries.AW_CUSTOMER_QUERIES)
aw["source"] = "AdventureWorks2022DWH"
wwi = self._run_query_list("wwi", queries.WWI_CUSTOMER_QUERIES)
wwi["source"] = "WorldWideImporters"
return pd.concat([aw, wwi], ignore_index=True)
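
Usage sketch (not part of this commit; requires reachable MSSQL instances for both sources):

from app.db.engine import create_warehouse_engines, dispose_engines
from app.services.warehouse_service import ReadOnlyWarehouseClient

engines = create_warehouse_engines()
try:
    client = ReadOnlyWarehouseClient(engines)
    daily = client.fetch_daily_sales()  # both warehouses, tagged via a 'source' column
    print(daily.groupby("source")["revenue"].sum())
finally:
    dispose_engines(engines)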