diff --git a/README.md b/README.md index 66dae0d..ba567df 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,19 @@ System metrics (gc, requests, etc.) and several business metrics (`lotty_decide_ ### OTEL -You can enable full observability for backend by setting `OTEL_ENABLED=True` in `.env`, also you need to run `compose.prod.yaml` with `observability` profile in order to deploy full observability stack +You can enable full observability for the backend by setting `OTEL_ENABLED=True` in `.env`; you also need to run `compose.prod.yaml` with the `observability` profile to deploy the full observability stack. + +![Grafana Logs](./assets/images/logs_grafana.png) +![Grafana Traces](./assets/images/tracing.png) +![Grafana Metrics](./assets/images/metrics_grafana.png) + +There are also other predefined dashboards for services. + +Grafana ([localhost/grafana/](http://localhost:80/grafana/)): + +login: `admin` + +password: `prooooood` ## Load testing (k6) diff --git a/artifacts/k6/20260224171916/fixture.json b/artifacts/k6/20260224171916/fixture.json new file mode 100644 index 0000000..0c6c94a --- /dev/null +++ b/artifacts/k6/20260224171916/fixture.json @@ -0,0 +1 @@ +{"run_id": "r_20260224171916", "flag_id": "c4233665-691f-4913-855e-054cb0b634c7", "flag_key": "k6_r_20260224171916_flag", "experiment_id": "3fae9bd2-d1eb-4417-8dbc-3f48bd99df16", "experiment_status": "running", "owner": "experimenter", "approver": "approver", "subject_attributes": {"country": "US"}} diff --git a/artifacts/k6/20260224171916/run.env b/artifacts/k6/20260224171916/run.env new file mode 100644 index 0000000..06a1e18 --- /dev/null +++ b/artifacts/k6/20260224171916/run.env @@ -0,0 +1,16 @@ +RUN_ID=20260224171916 +K6_IMAGE=grafana/k6:0.50.0 +K6_BASE_URL=http://host.docker.internal +FLAG_KEY=k6_r_20260224171916_flag +SUBJECT_COUNTRY=US +START_RPS=20 +RAMP_UP_RPS=100 +HOLD_RPS=100 +PRE_ALLOCATED_VUS=100 +MAX_VUS=600 +RAMP_UP_DURATION=30s +HOLD_DURATION=2m +RAMP_DOWN_DURATION=20s 
+THRESHOLD_ERROR_RATE=0.01 +THRESHOLD_P95_MS=250 +THRESHOLD_P99_MS=500 diff --git a/artifacts/k6/20260224171916/summary.json b/artifacts/k6/20260224171916/summary.json new file mode 100644 index 0000000..989cb7c --- /dev/null +++ b/artifacts/k6/20260224171916/summary.json @@ -0,0 +1,176 @@ +{ + "root_group": { + "id": "d41d8cd98f00b204e9800998ecf8427e", + "groups": {}, + "checks": { + "status is 200": { + "fails": 2, + "name": "status is 200", + "path": "::status is 200", + "id": "6210a8cd14cd70477eba5c5e4cb3fb5f", + "passes": 14798 + }, + "has one decision": { + "name": "has one decision", + "path": "::has one decision", + "id": "7486472dbb4301cd7f9433accea21ab5", + "passes": 14798, + "fails": 2 + } + }, + "name": "", + "path": "" + }, + "metrics": { + "vus_max": { + "value": 100, + "min": 100, + "max": 100 + }, + "data_sent": { + "rate": 22664.865759829918, + "count": 3854582 + }, + "iterations": { + "count": 14800, + "rate": 87.02370665495839 + }, + "decide_status_200_rate": { + "passes": 14798, + "fails": 2, + "thresholds": { + "rate>0.99": false + }, + "value": 0.9998648648648648 + }, + "decide_experiment_assigned_rate": { + "passes": 14798, + "fails": 2, + "value": 0.9998648648648648 + }, + "http_req_blocked": { + "med": 0.003334, + "p(90)": 0.011583, + "p(95)": 0.018917, + "p(99)": 0.19010065999999978, + "max": 16.584178, + "avg": 0.013590071283783596, + "min": 0.001208 + }, + "http_req_waiting": { + "avg": 39.85390373709466, + "min": 11.816621, + "med": 16.489178, + "p(90)": 77.800971, + "p(95)": 166.82410669999987, + "p(99)": 419.18147045999984, + "max": 1044.069215 + }, + "iteration_duration": { + "med": 16.804991, + "p(90)": 78.58091970000002, + "p(95)": 168.52110554999976, + "p(99)": 421.9347988599999, + "max": 1048.179228, + "avg": 40.33107365702705, + "min": 11.977788 + }, + "vus": { + "value": 0, + "min": 0, + "max": 36 + }, + "http_req_duration": { + "max": 1047.518767, + "avg": 40.00110370472988, + "min": 11.857496, + "med": 16.56947, + "p(90)": 
78.08279640000002, + "p(95)": 167.87539784999998, + "p(99)": 420.48415555999975, + "thresholds": { + "p(95)<250": false, + "p(99)<500": false + } + }, + "http_req_connecting": { + "med": 0, + "p(90)": 0, + "p(95)": 0, + "p(99)": 0, + "max": 1.364963, + "avg": 0.003630033851351352, + "min": 0 + }, + "http_req_duration{expected_response:true}": { + "med": 16.5694485, + "p(90)": 78.08813720000002, + "p(95)": 167.87621554999998, + "p(99)": 420.52659067999974, + "max": 1047.518767, + "avg": 40.00300859345872, + "min": 11.857496 + }, + "http_req_sending": { + "p(90)": 0.040708100000000004, + "p(95)": 0.062042, + "p(99)": 0.21800307999999996, + "max": 35.933239, + "avg": 0.03309161581081086, + "min": 0.004334, + "med": 0.015917 + }, + "decide_requests_total": { + "count": 14800, + "rate": 87.02370665495839 + }, + "http_req_tls_handshaking": { + "avg": 0, + "min": 0, + "med": 0, + "p(90)": 0, + "p(95)": 0, + "p(99)": 0, + "max": 0 + }, + "http_req_receiving": { + "p(90)": 0.125209, + "p(95)": 0.23725719999999992, + "p(99)": 1.3213682599999996, + "max": 27.769673, + "avg": 0.11410835182432402, + "min": 0.006667, + "med": 0.04525 + }, + "decide_request_duration_ms": { + "min": 11.857496, + "med": 16.56947, + "p(90)": 78.08279640000002, + "p(95)": 167.87539784999998, + "p(99)": 420.48415555999975, + "max": 1047.518767, + "avg": 40.00110370472988 + }, + "http_req_failed": { + "fails": 14798, + "passes": 2, + "thresholds": { + "rate<0.01": false + }, + "value": 0.00013513513513513514 + }, + "http_reqs": { + "count": 14800, + "rate": 87.02370665495839 + }, + "checks": { + "fails": 4, + "passes": 29596, + "value": 0.9998648648648648 + }, + "data_received": { + "count": 8109462, + "rate": 47683.475825508925 + } + } +} \ No newline at end of file diff --git a/assets/images/logs_grafana.png b/assets/images/logs_grafana.png new file mode 100644 index 0000000..614a2b0 --- /dev/null +++ b/assets/images/logs_grafana.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8b00fdfc586d4dce39890580806759728a5ea987a4bdb7a1b0a4d33d8ea329b1 +size 445444 diff --git a/assets/images/metrics_grafana.png b/assets/images/metrics_grafana.png new file mode 100644 index 0000000..09bcd5d --- /dev/null +++ b/assets/images/metrics_grafana.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b935a06b651bc4d51a93fe5d6ce33c0856a23e66cb6c0ef9e013477aa5c9d97c +size 161681 diff --git a/assets/images/tracing.png b/assets/images/tracing.png new file mode 100644 index 0000000..1b7f2de --- /dev/null +++ b/assets/images/tracing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225db12c711fc4d4c1acebd9fe0a89bb757f1cfec360042222d59b8a74181080 +size 397389 diff --git a/compliance-matrix.md b/compliance-matrix.md index daafc08..6a643eb 100644 --- a/compliance-matrix.md +++ b/compliance-matrix.md @@ -53,7 +53,7 @@ | `3.7` | `B9-2` | Нужен отдельный liveness probe | `src/backend/api/urls.py` (`/health`) | Runtime: `curl -i /health` | Поднятый backend | подтверждено | | `D.5(B9)` | `B9-3` | Без метрик нет наблюдаемости hot-path | `src/backend/config/settings/base.py`, `src/backend/apps/decision/services.py`, `src/backend/api/v1/events/endpoints.py` | Runtime: `curl /metrics` | Поднятый backend | подтверждено | | `D.5(B9)` | `B9-4` | Неструктурированные логи сложны для алертов и анализа | `src/backend/config/settings/base.py` (json formatter, django-guid) | Запуск в non-debug и просмотр stdout | Конфигурация `DJANGO_DEBUG=false` | подтверждено | -| `D.5(B9)` | `B9-6` | Рост трафика/данных может деградировать latency даже после оптимизаций | `src/backend/apps/decision/services.py`, `src/backend/apps/reports/services.py`, `src/backend/apps/events/tasks.py`, `ADR/04-decisions.md` (P1-P4) | Код-ревью + live-demo под нагрузкой | Сценарий увеличенного трафика и наблюдение очереди/latency | частично (live-demo) | +| `D.5(B9)` | `B9-6` | Рост трафика/данных может деградировать latency даже после оптимизаций | 
`infrastructure/k6/decide.js`, `infrastructure/k6/run-decide.sh`, `infrastructure/k6/README.md`, `src/backend/apps/decision/services.py`, `src/backend/apps/reports/services.py`, `src/backend/apps/events/tasks.py`, `ADR/04-decisions.md` (P1-P4) | Нагрузочный прогон `./infrastructure/k6/run-decide.sh` + анализ `artifacts/k6/$RUN_ID/summary.json` (p95/p99, error rate, req/s); фактические прогоны: `artifacts/k6/smoke_k6`, `artifacts/k6/20260224171916`; в прогоне `20260224171916`: `14_800` запросов, `87.02 req/s`, `p95=167.88 ms`, `p99=420.48 ms`, `error_rate=0.0135%` (2/14800) | Поднятый compose-стек и артефакты `artifacts/k6/$RUN_ID/{summary.json,run.env,fixture.json}` | частично (live-demo) | | `D.5(B9)` | `B9-7` | Без индексов и оптимизаций горячие запросы дорожают | `src/backend/apps/experiments/models.py`, `src/backend/apps/events/models.py`, `src/backend/apps/guardrails/models.py`, `src/backend/apps/notifications/models.py`, `src/backend/apps/learnings/models.py` | Схема моделей и миграций | БД-схема проекта | подтверждено | | `D.5(B10)` | `B10-1` | Отсутствие автоматического линтинга снижает качество | `src/backend/justfile`, `src/backend/pyproject.toml`, `.gitlab-ci.yml` | `cd src/backend && just lint` | Dev dependencies | подтверждено | | `D.5(B10)` | `B10-2` | Отсутствие форматирования повышает шум в diff | `src/backend/justfile`, `src/backend/pyproject.toml`, `.gitlab-ci.yml` | `cd src/backend && just format` | Dev dependencies | подтверждено | diff --git a/infrastructure/k6/run-decide.sh b/infrastructure/k6/run-decide.sh index 3ed13b9..5def719 100755 --- a/infrastructure/k6/run-decide.sh +++ b/infrastructure/k6/run-decide.sh @@ -9,8 +9,8 @@ K6_IMAGE="${K6_IMAGE:-grafana/k6:0.50.0}" K6_BASE_URL="${K6_BASE_URL:-http://host.docker.internal}" START_RPS="${START_RPS:-20}" -RAMP_UP_RPS="${RAMP_UP_RPS:-200}" -HOLD_RPS="${HOLD_RPS:-200}" +RAMP_UP_RPS="${RAMP_UP_RPS:-100}" +HOLD_RPS="${HOLD_RPS:-100}" PRE_ALLOCATED_VUS="${PRE_ALLOCATED_VUS:-100}" MAX_VUS="${MAX_VUS:-600}" 
RAMP_UP_DURATION="${RAMP_UP_DURATION:-30s}"