Add initial work from Codex

This commit is contained in:
2026-03-20 15:13:33 +01:00
parent 19771ddd37
commit adb5c1a439
48 changed files with 7054 additions and 16 deletions

View File

@@ -0,0 +1 @@
"""Business logic services."""

View File

@@ -0,0 +1,373 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import date, timedelta
from math import sqrt
import numpy as np
import pandas as pd
from opentelemetry import trace
from sklearn.linear_model import LinearRegression
from app.core.config import settings
from app.services.persistence_service import PersistenceService
from app.services.warehouse_service import ReadOnlyWarehouseClient
@dataclass
class DashboardSnapshot:
    """Bundle of the five payloads returned together for the dashboard.

    Each field holds the plain dict / list-of-dict structure produced by the
    correspondingly named ``AnalyticsService`` getter.
    """

    # Aggregate key figures as produced by ``AnalyticsService.get_kpis``.
    kpis: dict
    # Per-day history points as produced by ``get_history_points``.
    history: list[dict]
    # Forecast points as produced by ``get_forecast``.
    forecasts: list[dict]
    # Ranked products as produced by ``get_rankings``.
    rankings: list[dict]
    # Recommendation entries as produced by ``get_recommendations``.
    recommendations: list[dict]
class AnalyticsService:
    """Derive KPIs, history, forecasts, rankings and recommendations.

    All data is read through the injected warehouse client. When a
    persistence service is supplied, forecast / ranking / recommendation
    results are additionally recorded together with the current trace ids.
    """

    # Numeric measure columns coerced and aggregated for daily sales and
    # product performance frames.
    _METRIC_COLUMNS = ("revenue", "cost", "quantity", "orders")

    def __init__(
        self,
        warehouse_client: ReadOnlyWarehouseClient,
        persistence_service: PersistenceService | None = None,
    ) -> None:
        """Store the collaborators and create a tracer for this module."""
        self.warehouse_client = warehouse_client
        self.persistence_service = persistence_service
        self.tracer = trace.get_tracer(__name__)

    @staticmethod
    def _current_trace_ids() -> tuple[str | None, str | None]:
        """Return ``(trace_id, span_id)`` of the current span as hex strings.

        Both values are ``None`` when the current span context is not valid.
        """
        span_context = trace.get_current_span().get_span_context()
        if not span_context.is_valid:
            return None, None
        return f"{span_context.trace_id:032x}", f"{span_context.span_id:016x}"

    @staticmethod
    def _normalize_frame(df: pd.DataFrame, date_col: str = "sale_date") -> pd.DataFrame:
        """Return a cleaned copy of *df*; the input frame is not modified.

        The date column is parsed to datetimes (unparseable values become
        ``NaT`` and those rows are dropped). Any of the metric columns that
        are present are coerced to numbers with invalid values replaced by
        ``0.0``.

        An empty frame that lacks *date_col* is given an empty date column
        instead of raising ``KeyError``. A non-empty frame without the
        column still raises ``KeyError``.
        """
        normalized = df.copy()
        if date_col not in normalized.columns and normalized.empty:
            # Scalar assignment on a zero-row frame creates a zero-row column.
            normalized[date_col] = pd.NaT
        normalized[date_col] = pd.to_datetime(normalized[date_col], errors="coerce")
        for numeric in AnalyticsService._METRIC_COLUMNS:
            if numeric in normalized.columns:
                normalized[numeric] = pd.to_numeric(
                    normalized[numeric], errors="coerce"
                ).fillna(0.0)
        return normalized.dropna(subset=[date_col])

    def load_sales_history(self, days_back: int | None = None) -> pd.DataFrame:
        """Return per-day totals of the metric columns, sorted by date.

        Args:
            days_back: Size of the look-back window in days counted from
                today. A falsy value (``None`` or ``0``) falls back to
                ``settings.default_history_days``.

        Returns:
            One row per ``sale_date`` with summed revenue, cost, quantity
            and orders. An empty frame with those columns is returned when
            no rows fall inside the window.
        """
        with self.tracer.start_as_current_span("analytics.load_sales_history"):
            daily_sales = self._normalize_frame(
                self.warehouse_client.fetch_daily_sales()
            )
            days = days_back or settings.default_history_days
            min_date = pd.Timestamp(date.today() - timedelta(days=days))
            filtered = daily_sales[daily_sales["sale_date"] >= min_date]
            metric_columns = list(self._METRIC_COLUMNS)
            if filtered.empty:
                # Skip the aggregation so that a warehouse result without
                # the metric columns cannot raise KeyError here.
                return pd.DataFrame(columns=["sale_date", *metric_columns])
            return (
                filtered.groupby("sale_date", as_index=False)[metric_columns]
                .sum()
                .sort_values("sale_date")
            )

    def get_kpis(self) -> dict:
        """Return headline figures over the last 180 days of sales.

        All values are zero when the window contains no sales rows.
        """
        with self.tracer.start_as_current_span("analytics.kpis"):
            sales = self.load_sales_history(days_back=180)
            if sales.empty:
                return {
                    "total_revenue": 0.0,
                    "gross_margin_pct": 0.0,
                    "total_quantity": 0.0,
                    "avg_order_value": 0.0,
                    "records_in_window": 0,
                }
            total_revenue = float(sales["revenue"].sum())
            total_cost = float(sales["cost"].sum())
            # Floor at 1 so the average order value never divides by zero.
            total_orders = max(float(sales["orders"].sum()), 1.0)
            margin_pct = (
                ((total_revenue - total_cost) / total_revenue * 100)
                if total_revenue
                else 0.0
            )
            return {
                "total_revenue": round(total_revenue, 2),
                "gross_margin_pct": round(margin_pct, 2),
                "total_quantity": round(float(sales["quantity"].sum()), 2),
                "avg_order_value": round(total_revenue / total_orders, 2),
                "records_in_window": int(sales.shape[0]),
            }

    def get_history_points(self, days_back: int | None = None) -> list[dict]:
        """Return one ``{date, revenue, cost, quantity}`` dict per sales day.

        Args:
            days_back: Passed through to ``load_sales_history``.
        """
        with self.tracer.start_as_current_span("analytics.history_points"):
            sales = self.load_sales_history(days_back=days_back)
            if sales.empty:
                return []
            return [
                {
                    "date": pd.Timestamp(row.sale_date).date().isoformat(),
                    "revenue": round(float(row.revenue), 2),
                    "cost": round(float(row.cost), 2),
                    "quantity": round(float(row.quantity), 2),
                }
                for row in sales.itertuples(index=False)
            ]

    def get_forecast(
        self,
        horizon_days: int | None = None,
        *,
        trigger_source: str = "api.forecasts",
        persist: bool = True,
    ) -> list[dict]:
        """Forecast daily revenue for the days following the last sale date.

        A linear regression over the day index of the last 720 days gives
        the trend. Each predicted value is scaled by the ratio of that
        weekday's mean revenue to the overall mean, and clamped at zero.
        The bounds use 1.96 times the residual standard deviation, widened
        with the distance from the end of the series.

        Args:
            horizon_days: Number of days to predict. A falsy value falls
                back to ``settings.forecast_horizon_days``.
            trigger_source: Label stored with the persisted run.
            persist: Record the run when a persistence service is set.

        Returns:
            One dict per forecast day, or an empty list when there is no
            sales history (nothing is persisted in that case).
        """
        with self.tracer.start_as_current_span("analytics.forecast"):
            horizon = horizon_days or settings.forecast_horizon_days
            sales = self.load_sales_history(days_back=720)
            if sales.empty:
                return []

            # Reindex to a gap-free daily series; missing days count as 0.
            series = (
                sales.set_index("sale_date")["revenue"]
                .sort_index()
                .resample("D")
                .sum()
                .fillna(0.0)
            )
            y = series.values
            x = np.arange(len(y), dtype=float).reshape(-1, 1)
            model = LinearRegression()
            model.fit(x, y)
            residual = y - model.predict(x)
            sigma = float(np.std(residual)) if len(residual) > 1 else 0.0

            weekday_baseline = series.groupby(series.index.weekday).mean()
            overall_mean = float(series.mean()) if len(series) else 0.0
            weekday_factor = (
                weekday_baseline / overall_mean
                if overall_mean > 0
                else pd.Series([1.0] * 7, index=range(7))
            )
            weekday_factor = weekday_factor.replace([np.inf, -np.inf], 1.0).fillna(1.0)

            future_x = np.arange(len(y), len(y) + horizon, dtype=float).reshape(-1, 1)
            raw_forecast = model.predict(future_x)
            start_date = series.index.max().date()
            sample_count = max(len(y), 1)

            predictions: list[dict] = []
            for idx, point in enumerate(raw_forecast, start=1):
                day = start_date + timedelta(days=idx)
                # Weekdays absent from the history get a neutral factor.
                factor = (
                    float(weekday_factor.loc[day.weekday()])
                    if day.weekday() in weekday_factor.index
                    else 1.0
                )
                yhat = max(float(point) * factor, 0.0)
                ci = 1.96 * sigma * sqrt(1 + idx / sample_count)
                predictions.append(
                    {
                        "date": day.isoformat(),
                        "predicted_revenue": round(yhat, 2),
                        "lower_bound": round(max(yhat - ci, 0.0), 2),
                        "upper_bound": round(yhat + ci, 2),
                    }
                )

            if persist and self.persistence_service is not None:
                trace_id, span_id = self._current_trace_ids()
                self.persistence_service.record_forecast_run(
                    horizon_days=horizon,
                    payload=predictions,
                    trigger_source=trigger_source,
                    trace_id=trace_id,
                    span_id=span_id,
                )
            return predictions

    def get_rankings(
        self,
        top_n: int | None = None,
        *,
        trigger_source: str = "api.rankings",
        persist: bool = True,
    ) -> list[dict]:
        """Rank products by a weighted revenue / margin / quantity score.

        The score is ``0.55 * revenue + 0.30 * margin + 0.15 * quantity``
        where revenue and quantity are divided by their maximum (floored at
        1) and margin percent is mapped from ``[-100, 100]`` to ``[0, 1]``.
        The reported score is that value times 100.

        Args:
            top_n: Number of products to return. A falsy value falls back
                to ``settings.ranking_default_top_n``.
            trigger_source: Label stored with the persisted run.
            persist: Record the run when a persistence service is set.

        Returns:
            Ranked product dicts, best first, or an empty list when the
            warehouse returns no products (nothing is persisted then).
        """
        with self.tracer.start_as_current_span("analytics.rankings"):
            n = top_n or settings.ranking_default_top_n
            products = self.warehouse_client.fetch_product_performance().copy()
            if products.empty:
                return []

            for column in self._METRIC_COLUMNS:
                products[column] = pd.to_numeric(
                    products[column], errors="coerce"
                ).fillna(0.0)

            # NOTE(review): groupby drops rows whose key columns are null
            # (pandas default dropna=True) - confirm that is intended.
            grouped = (
                products.groupby(
                    ["product_id", "product_name", "category_name"], as_index=False
                )[list(self._METRIC_COLUMNS)]
                .sum()
                .sort_values("revenue", ascending=False)
            )
            grouped["margin_pct"] = np.where(
                grouped["revenue"] > 0,
                ((grouped["revenue"] - grouped["cost"]) / grouped["revenue"]) * 100,
                0.0,
            )
            revenue_norm = grouped["revenue"] / max(
                float(grouped["revenue"].max()), 1.0
            )
            margin_norm = (grouped["margin_pct"] + 100) / 200
            velocity_norm = grouped["quantity"] / max(
                float(grouped["quantity"].max()), 1.0
            )
            grouped["score"] = (
                (0.55 * revenue_norm)
                + (0.30 * margin_norm.clip(0, 1))
                + (0.15 * velocity_norm)
            )
            ranked = (
                grouped.sort_values("score", ascending=False)
                .head(n)
                .reset_index(drop=True)
            )
            # The index was reset above, so idx runs 0..len-1 in rank order.
            result = [
                {
                    "rank": int(idx + 1),
                    "product_id": str(row["product_id"]),
                    "product_name": str(row["product_name"]),
                    "category": str(row["category_name"]),
                    "revenue": round(float(row["revenue"]), 2),
                    "margin_pct": round(float(row["margin_pct"]), 2),
                    "score": round(float(row["score"]) * 100, 2),
                }
                for idx, row in ranked.iterrows()
            ]

            if persist and self.persistence_service is not None:
                trace_id, span_id = self._current_trace_ids()
                self.persistence_service.record_ranking_run(
                    top_n=n,
                    payload=result,
                    trigger_source=trigger_source,
                    trace_id=trace_id,
                    span_id=span_id,
                )
            return result

    def get_recommendations(
        self,
        rankings: list[dict] | None = None,
        *,
        trigger_source: str = "api.recommendations",
        persist: bool = True,
    ) -> list[dict]:
        """Build up to three textual recommendations.

        One entry is produced for the top-ranked product, one for the first
        ranked product with a margin below 10 percent, and one for the
        customer with the highest revenue, each only when data exists.

        Args:
            rankings: Pre-computed ranking rows. When ``None`` the top 20
                rankings are computed (and persisted according to
                *persist*) first.
            trigger_source: Label stored with the persisted run.
            persist: Record the run when a persistence service is set.
        """
        with self.tracer.start_as_current_span("analytics.recommendations"):
            ranking_rows = (
                rankings
                if rankings is not None
                else self.get_rankings(
                    top_n=20, trigger_source=trigger_source, persist=persist
                )
            )
            # Copy because the numeric columns are rewritten below.
            customers = self.warehouse_client.fetch_customer_performance().copy()

            recommendations: list[dict] = []
            if ranking_rows:
                champion = ranking_rows[0]
                recommendations.append(
                    {
                        "title": "Double down on champion SKU",
                        "priority": "high",
                        "summary": (
                            f"Promote '{champion['product_name']}' with score {champion['score']:.2f} "
                            f"and margin {champion['margin_pct']:.2f}%."
                        ),
                    }
                )
                low_margin = next(
                    (row for row in ranking_rows if row["margin_pct"] < 10), None
                )
                if low_margin:
                    recommendations.append(
                        {
                            "title": "Review pricing for low-margin bestseller",
                            "priority": "medium",
                            "summary": (
                                f"'{low_margin['product_name']}' has strong rank but only "
                                f"{low_margin['margin_pct']:.2f}% margin."
                            ),
                        }
                    )

            if not customers.empty:
                customers["revenue"] = pd.to_numeric(
                    customers["revenue"], errors="coerce"
                ).fillna(0.0)
                customers["orders"] = pd.to_numeric(
                    customers["orders"], errors="coerce"
                ).fillna(0.0)
                customer = customers.sort_values("revenue", ascending=False).iloc[0]
                recommendations.append(
                    {
                        "title": "Protect top customer relationship",
                        "priority": "high",
                        "summary": (
                            f"Prioritize retention for '{customer['customer_name']}' with "
                            f"{float(customer['orders']):.0f} orders and {float(customer['revenue']):.2f} revenue."
                        ),
                    }
                )

            result = recommendations[:5]
            if persist and self.persistence_service is not None:
                trace_id, span_id = self._current_trace_ids()
                self.persistence_service.record_recommendation_run(
                    payload=result,
                    trigger_source=trigger_source,
                    trace_id=trace_id,
                    span_id=span_id,
                )
            return result

    def get_dashboard(self) -> DashboardSnapshot:
        """Assemble every dashboard section into one snapshot.

        Rankings are computed once and reused for the recommendations. All
        persisted runs are labelled with the ``api.dashboard`` source.
        """
        with self.tracer.start_as_current_span("analytics.dashboard"):
            rankings = self.get_rankings(trigger_source="api.dashboard", persist=True)
            return DashboardSnapshot(
                kpis=self.get_kpis(),
                history=self.get_history_points(),
                forecasts=self.get_forecast(
                    trigger_source="api.dashboard", persist=True
                ),
                rankings=rankings,
                recommendations=self.get_recommendations(
                    rankings=rankings,
                    trigger_source="api.dashboard",
                    persist=True,
                ),
            )

View File

@@ -0,0 +1,281 @@
from __future__ import annotations
import logging
from time import perf_counter
from opentelemetry import metrics, trace
from sqlalchemy import desc, select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import Session, sessionmaker
from app.db.postgres_models import AuditLog, ForecastRun, RankingRun, RecommendationRun
# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
class PersistenceService:
    """Write and read audit logs and analytics runs through SQLAlchemy.

    Write failures raised as ``SQLAlchemyError`` are logged and counted but
    not re-raised, so callers continue when persistence fails. Every write
    records a counter increment (tagged ``ok`` / ``error``) and its latency.
    """

    def __init__(self, session_factory: sessionmaker[Session]) -> None:
        """Store the session factory and create tracer and metric instruments."""
        self.session_factory = session_factory
        self.tracer = trace.get_tracer(__name__)
        self.meter = metrics.get_meter(__name__)
        self.write_counter = self.meter.create_counter(
            name="postgres_persist_writes_total",
            description="Total writes to app persistence PostgreSQL",
        )
        self.write_latency = self.meter.create_histogram(
            name="postgres_persist_write_latency_ms",
            unit="ms",
            description="Latency of app persistence write operations",
        )

    @staticmethod
    def _to_audit_dict(row: AuditLog) -> dict:
        """Convert an audit log row to a plain dict with an ISO timestamp."""
        return {
            "id": row.id,
            "created_at": row.created_at.isoformat(),
            "method": row.method,
            "path": row.path,
            "query_string": row.query_string,
            "status_code": row.status_code,
            "duration_ms": row.duration_ms,
            "trace_id": row.trace_id,
            "span_id": row.span_id,
            "client_ip": row.client_ip,
            "user_agent": row.user_agent,
            "details": row.details,
        }

    @staticmethod
    def _to_forecast_dict(row: ForecastRun) -> dict:
        """Convert a forecast run row to a plain dict with an ISO timestamp."""
        return {
            "id": row.id,
            "created_at": row.created_at.isoformat(),
            "horizon_days": row.horizon_days,
            "point_count": row.point_count,
            "trigger_source": row.trigger_source,
            "trace_id": row.trace_id,
            "span_id": row.span_id,
            "payload": row.payload,
        }

    @staticmethod
    def _to_ranking_dict(row: RankingRun) -> dict:
        """Convert a ranking run row to a plain dict with an ISO timestamp."""
        return {
            "id": row.id,
            "created_at": row.created_at.isoformat(),
            "top_n": row.top_n,
            "item_count": row.item_count,
            "trigger_source": row.trigger_source,
            "trace_id": row.trace_id,
            "span_id": row.span_id,
            "payload": row.payload,
        }

    @staticmethod
    def _to_recommendation_dict(row: RecommendationRun) -> dict:
        """Convert a recommendation run row to a plain dict with an ISO timestamp."""
        return {
            "id": row.id,
            "created_at": row.created_at.isoformat(),
            "item_count": row.item_count,
            "trigger_source": row.trigger_source,
            "trace_id": row.trace_id,
            "span_id": row.span_id,
            "payload": row.payload,
        }

    def _persist(
        self,
        *,
        entity: str,
        span_name: str,
        failure_message: str,
        model: type,
        fields: dict,
    ) -> None:
        """Insert one ``model(**fields)`` row inside its own span and session.

        Args:
            entity: Value of the ``entity`` metric attribute.
            span_name: Name of the span wrapping the write.
            failure_message: ``%s``-style log template; receives the error.
            model: Mapped class to instantiate.
            fields: Keyword arguments for the model constructor.
        """
        started = perf_counter()
        with self.tracer.start_as_current_span(span_name) as span:
            try:
                # The row is built inside the try so that SQLAlchemy errors
                # raised while instantiating the model are handled as well.
                with self.session_factory() as session:
                    session.add(model(**fields))
                    session.commit()
                self.write_counter.add(
                    1, attributes={"entity": entity, "status": "ok"}
                )
            except SQLAlchemyError as exc:
                LOGGER.exception(failure_message, exc)
                # The error is swallowed below, so flag the span explicitly;
                # otherwise the trace would show a successful write.
                span.record_exception(exc)
                span.set_status(trace.Status(trace.StatusCode.ERROR, str(exc)))
                self.write_counter.add(
                    1, attributes={"entity": entity, "status": "error"}
                )
            finally:
                self.write_latency.record(
                    (perf_counter() - started) * 1000,
                    attributes={"entity": entity},
                )

    def _list_recent(self, *, span_name: str, model: type, to_dict, limit: int) -> list[dict]:
        """Return up to *limit* rows of *model*, newest ``created_at`` first.

        Args:
            span_name: Name of the span wrapping the query.
            model: Mapped class to select; must expose ``created_at``.
            to_dict: Callable converting one row to a dict.
            limit: Maximum number of rows to return.
        """
        with self.tracer.start_as_current_span(span_name):
            with self.session_factory() as session:
                rows = session.execute(
                    select(model).order_by(desc(model.created_at)).limit(limit)
                ).scalars()
                # Convert while the session is still open.
                return [to_dict(row) for row in rows]

    def record_audit_log(
        self,
        *,
        method: str,
        path: str,
        query_string: str,
        status_code: int,
        duration_ms: float,
        trace_id: str | None,
        span_id: str | None,
        client_ip: str | None,
        user_agent: str | None,
        details: dict | None = None,
    ) -> None:
        """Store one audit log entry.

        The query string is truncated to 1000 characters and missing
        details are stored as an empty dict.
        """
        self._persist(
            entity="audit",
            span_name="persist.audit_log",
            failure_message="Failed to persist audit log: %s",
            model=AuditLog,
            fields={
                "method": method,
                "path": path,
                "query_string": query_string[:1000],
                "status_code": status_code,
                "duration_ms": duration_ms,
                "trace_id": trace_id,
                "span_id": span_id,
                "client_ip": client_ip,
                "user_agent": user_agent,
                "details": details or {},
            },
        )

    def record_forecast_run(
        self,
        *,
        horizon_days: int,
        payload: list[dict],
        trigger_source: str,
        trace_id: str | None,
        span_id: str | None,
    ) -> None:
        """Store one forecast run; ``point_count`` is the payload length."""
        self._persist(
            entity="forecast",
            span_name="persist.forecast_run",
            failure_message="Failed to persist forecast run: %s",
            model=ForecastRun,
            fields={
                "horizon_days": horizon_days,
                "point_count": len(payload),
                "trigger_source": trigger_source,
                "trace_id": trace_id,
                "span_id": span_id,
                "payload": payload,
            },
        )

    def record_ranking_run(
        self,
        *,
        top_n: int,
        payload: list[dict],
        trigger_source: str,
        trace_id: str | None,
        span_id: str | None,
    ) -> None:
        """Store one ranking run; ``item_count`` is the payload length."""
        self._persist(
            entity="ranking",
            span_name="persist.ranking_run",
            failure_message="Failed to persist ranking run: %s",
            model=RankingRun,
            fields={
                "top_n": top_n,
                "item_count": len(payload),
                "trigger_source": trigger_source,
                "trace_id": trace_id,
                "span_id": span_id,
                "payload": payload,
            },
        )

    def record_recommendation_run(
        self,
        *,
        payload: list[dict],
        trigger_source: str,
        trace_id: str | None,
        span_id: str | None,
    ) -> None:
        """Store one recommendation run; ``item_count`` is the payload length."""
        self._persist(
            entity="recommendation",
            span_name="persist.recommendation_run",
            failure_message="Failed to persist recommendation run: %s",
            model=RecommendationRun,
            fields={
                "item_count": len(payload),
                "trigger_source": trigger_source,
                "trace_id": trace_id,
                "span_id": span_id,
                "payload": payload,
            },
        )

    def list_audit_logs(self, limit: int) -> list[dict]:
        """Return the newest audit log entries as dicts."""
        return self._list_recent(
            span_name="persist.list_audit_logs",
            model=AuditLog,
            to_dict=self._to_audit_dict,
            limit=limit,
        )

    def list_forecast_runs(self, limit: int) -> list[dict]:
        """Return the newest forecast runs as dicts."""
        return self._list_recent(
            span_name="persist.list_forecast_runs",
            model=ForecastRun,
            to_dict=self._to_forecast_dict,
            limit=limit,
        )

    def list_ranking_runs(self, limit: int) -> list[dict]:
        """Return the newest ranking runs as dicts."""
        return self._list_recent(
            span_name="persist.list_ranking_runs",
            model=RankingRun,
            to_dict=self._to_ranking_dict,
            limit=limit,
        )

    def list_recommendation_runs(self, limit: int) -> list[dict]:
        """Return the newest recommendation runs as dicts."""
        return self._list_recent(
            span_name="persist.list_recommendation_runs",
            model=RecommendationRun,
            to_dict=self._to_recommendation_dict,
            limit=limit,
        )

View File

@@ -0,0 +1,101 @@
from __future__ import annotations
import hashlib
import logging
from collections.abc import Sequence
from time import perf_counter
import pandas as pd
from opentelemetry import metrics, trace
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
from app.db import queries
# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
class ReadOnlyWarehouseClient:
    """Run read-only queries against the configured warehouse engines.

    Each fetch method queries the ``aw`` and ``wwi`` engines, tags the rows
    with a ``source`` column and returns the concatenated result.
    """

    # Lower-case prefixes a statement must start with to be executed.
    _READ_ONLY_PREFIXES = ("select", "with")

    def __init__(self, engines: dict[str, Engine]) -> None:
        """Store the engines by source key and create tracer and metrics."""
        self.engines = engines
        self.tracer = trace.get_tracer(__name__)
        self.meter = metrics.get_meter(__name__)
        self.query_counter = self.meter.create_counter(
            name="warehouse_queries_total",
            description="Total warehouse query executions",
        )
        self.query_latency = self.meter.create_histogram(
            name="warehouse_query_latency_ms",
            unit="ms",
            description="Warehouse query latency",
        )

    def _validate_read_only_query(self, sql: str) -> None:
        """Reject statements that do not start with SELECT or WITH.

        This is a prefix check only (case-insensitive, leading whitespace
        ignored); the remainder of the statement is not inspected.

        Raises:
            ValueError: If the statement starts with anything else.
        """
        normalized = sql.strip().lower()
        if not normalized.startswith(self._READ_ONLY_PREFIXES):
            raise ValueError("Only read-only SELECT/CTE SQL statements are allowed.")

    def _record_query(self, source: str, status: str, started: float) -> None:
        """Record latency since *started* and count one query with *status*."""
        elapsed_ms = (perf_counter() - started) * 1000
        self.query_latency.record(elapsed_ms, attributes={"source": source})
        self.query_counter.add(1, attributes={"source": source, "status": status})

    def _run_query_list(
        self, source: str, sql_candidates: Sequence[str]
    ) -> pd.DataFrame:
        """Run the candidates in order and return the first successful result.

        Args:
            source: Key of the engine in ``self.engines``.
            sql_candidates: Alternative statements to try one after another.

        Returns:
            The frame of the first candidate that executes, or an empty
            frame when *sql_candidates* is empty.

        Raises:
            KeyError: If *source* has no configured engine.
            ValueError: If a candidate is not a SELECT/WITH statement.
            RuntimeError: If every candidate failed with a SQLAlchemy
                error; the last error is chained as the cause.
        """
        engine = self.engines[source]
        last_error: Exception | None = None
        for candidate in sql_candidates:
            self._validate_read_only_query(candidate)
            # Short digest identifies the statement without logging its text.
            query_hash = hashlib.sha256(candidate.encode("utf-8")).hexdigest()[:12]
            with self.tracer.start_as_current_span("warehouse.query") as span:
                span.set_attribute("db.system", "mssql")
                span.set_attribute("db.source", source)
                span.set_attribute("db.query.hash", query_hash)
                started = perf_counter()
                try:
                    with engine.connect() as conn:
                        with self.tracer.start_as_current_span(
                            "warehouse.query.execute"
                        ):
                            df = pd.read_sql_query(sql=text(candidate), con=conn)
                except SQLAlchemyError as exc:
                    last_error = exc
                    self._record_query(source, "error", started)
                    # The error is swallowed so the next candidate can run;
                    # flag this span so the failure is visible in the trace.
                    span.record_exception(exc)
                    span.set_status(trace.Status(trace.StatusCode.ERROR, str(exc)))
                    LOGGER.warning(
                        "Query failed for %s with hash %s: %s", source, query_hash, exc
                    )
                else:
                    self._record_query(source, "ok", started)
                    return df
        if last_error is not None:
            raise RuntimeError(
                f"All query candidates failed for source '{source}'."
            ) from last_error
        return pd.DataFrame()

    def _fetch_combined(
        self, aw_queries: Sequence[str], wwi_queries: Sequence[str]
    ) -> pd.DataFrame:
        """Query both sources, tag each frame with ``source`` and concatenate."""
        aw = self._run_query_list("aw", aw_queries)
        aw["source"] = "AdventureWorks2022DWH"
        wwi = self._run_query_list("wwi", wwi_queries)
        wwi["source"] = "WorldWideImporters"
        return pd.concat([aw, wwi], ignore_index=True)

    def fetch_daily_sales(self) -> pd.DataFrame:
        """Return the daily sales rows of both sources."""
        return self._fetch_combined(
            queries.AW_DAILY_SALES_QUERIES, queries.WWI_DAILY_SALES_QUERIES
        )

    def fetch_product_performance(self) -> pd.DataFrame:
        """Return the product performance rows of both sources."""
        return self._fetch_combined(
            queries.AW_PRODUCT_PERFORMANCE_QUERIES,
            queries.WWI_PRODUCT_PERFORMANCE_QUERIES,
        )

    def fetch_customer_performance(self) -> pd.DataFrame:
        """Return the customer performance rows of both sources."""
        return self._fetch_combined(
            queries.AW_CUSTOMER_QUERIES, queries.WWI_CUSTOMER_QUERIES
        )