diff --git a/.opencode/agents/product-owner.md b/.opencode/agents/product-owner.md index e211ce7..403da3c 100644 --- a/.opencode/agents/product-owner.md +++ b/.opencode/agents/product-owner.md @@ -25,16 +25,16 @@ Load `skill session-workflow` first — it reads TODO.md, orients you to the cur | Step | Action | |---|---| -| **Step 1 — SCOPE** | Load `skill scope` — contains the full 4-phase discovery and criteria protocol | +| **Step 1 — SCOPE** | Load `skill scope` — contains Stage 1 (Discovery sessions) and Stage 2 (Stories + Criteria). At the end of Stage 2 Step B (criteria), write the `## Self-Declaration` block into `TODO.md` before committing — every DISAGREE is a hard blocker. | | **Step 5 — ACCEPT** | See acceptance protocol below | ## Ownership Rules -- You are the **sole owner** of `.feature` files and `docs/features/discovery.md` +- You are the **sole owner** of `.feature` files, `docs/discovery_journal.md`, and `docs/discovery.md` - No other agent may edit these files +- **You are the sole owner of all `.feature` file moves**: backlog → in-progress (before Step 2) and in-progress → completed (after Step 5 acceptance). No other agent moves `.feature` files. - Software-engineer escalates spec gaps to you; you decide whether to extend criteria -- **You pick** the next feature from backlog — the software-engineer never self-selects -- **NEVER move a feature to `in-progress/` unless its discovery section has `Status: BASELINED`** — if not baselined, complete Step 1 (Phase 2 + 3 + 4) first +- **NEVER move a feature to `in-progress/` unless its `.feature` file has `Status: BASELINED`** — if not baselined, complete Step 1 (Stage 1 Discovery + Stage 2 Specification) first ## Step 5 — Accept @@ -51,13 +51,21 @@ When a gap is reported (by software-engineer or reviewer): | Situation | Action | |---|---| -| Edge case within current user stories | Add a new Example with a new `@id` to the relevant `.feature` file. 
| +| Edge case within current user stories | Add a new Example to the relevant `.feature` file. | | New behavior beyond current stories | Add to backlog as a new feature. Do not extend the current feature. | -| Behavior contradicts an existing Example | Write a new Example with new `@id`. | -| Post-merge defect | Move the `.feature` file back to `in-progress/`, add new Example with `@id`, resume at Step 3. | +| Behavior contradicts an existing Example | Add `@deprecated` to the old Example; write a new Example. | +| Post-merge defect | Move the `.feature` file back to `in-progress/`, add new Example, resume at Step 3. | + +## Bug Handling + +When a defect is reported against any feature: + +1. Add a `@bug` Example to the relevant `Rule:` block in the `.feature` file using the standard `Given/When/Then` format describing the correct behavior. +2. Update TODO.md to note the new bug Example for the SE to implement. +3. SE implements the test in `tests/features/` **and** a `@given` Hypothesis property test in `tests/unit/`. Both are required. ## Available Skills - `session-workflow` — session start/end protocol - `feature-selection` — when TODO.md is idle: score and select next backlog feature using WSJF -- `scope` — Step 1: 3-session discovery (Phase 1 + 2), stories (Phase 3), and criteria (Phase 4) \ No newline at end of file +- `scope` — Step 1: Stage 1 (Discovery sessions with stakeholder) and Stage 2 (Stories + Criteria, PO alone) diff --git a/.opencode/agents/reviewer.md b/.opencode/agents/reviewer.md index 415d07f..0f0b350 100644 --- a/.opencode/agents/reviewer.md +++ b/.opencode/agents/reviewer.md @@ -29,8 +29,6 @@ permissions: You verify that work is done correctly by running commands and reading code. You do not write or edit files. -**Your default hypothesis is that the code is broken despite passing automated checks. Your job is to find the failure mode. If you cannot find one after thorough investigation, APPROVE. 
If you find one, REJECTED.** - ## Session Start Load `skill session-workflow` first. Then load `skill verify` for Step 4 verification. @@ -42,6 +40,7 @@ Load `skill session-workflow` first. Then load `skill verify` for Step 4 verific - **Never suggest `noqa`, `type: ignore`, or `pytest.skip` as a fix.** These are bypasses, not solutions. - **Report specific locations.** "`physics/engine.py:47`: unreachable return" not "there is dead code." - **Every PASS/FAIL cell must have evidence.** Empty evidence = UNCHECKED = REJECTED. +- **Never move `.feature` files.** The PO is the sole owner of all feature file moves. After producing an APPROVED report, update TODO.md and stop — the PO accepts and moves the file. ## Gap Reporting @@ -55,7 +54,4 @@ If you discover an observable behavior with no acceptance criterion: You never edit `.feature` files or add Examples yourself. -## Available Skills -- `session-workflow` — session start/end protocol -- `verify` — Step 4: full verification protocol with all tables, gates, and report template diff --git a/.opencode/agents/software-engineer.md b/.opencode/agents/software-engineer.md index a802229..10bdc5e 100644 --- a/.opencode/agents/software-engineer.md +++ b/.opencode/agents/software-engineer.md @@ -45,7 +45,14 @@ Load `skill session-workflow` first — it reads TODO.md, orients you to the cur - You own all technical decisions: module structure, patterns, internal APIs, test tooling, linting config - **PO approves**: new runtime dependencies, changed entry points, scope changes -- You are **never** the one to pick the next feature — only the PO picks from backlog +- **You never move `.feature` files.** The PO is the sole owner of all feature file moves (backlog → in-progress → completed). If you find no `.feature` file in `docs/features/in-progress/`, **STOP** — do not self-select a feature. Write the gap in TODO.md and escalate to PO. 
+ +## No In-Progress Feature + +If `docs/features/in-progress/` contains only `.gitkeep` (no `.feature` file): +1. Do not pick a feature from backlog yourself. +2. Update TODO.md: `Next: Run @product-owner — load skill feature-selection and pick the next BASELINED feature from backlog.` +3. Stop. The PO must move the chosen feature into `in-progress/` before you can begin Step 2. ## Spec Gaps @@ -61,4 +68,4 @@ If during implementation you discover behavior not covered by existing acceptanc - `design-patterns` — on-demand when smell detected during architecture or refactor - `pr-management` — Step 5: PRs with conventional commits - `git-release` — Step 5: calver versioning and themed release naming -- `create-skill` — meta: create new skills when needed \ No newline at end of file +- `create-skill` — meta: create new skills when needed diff --git a/.opencode/skills/create-skill/SKILL.md b/.opencode/skills/create-skill/SKILL.md index db8a679..39504b7 100644 --- a/.opencode/skills/create-skill/SKILL.md +++ b/.opencode/skills/create-skill/SKILL.md @@ -141,5 +141,6 @@ Add the skill name to the agent's "Available Skills" section so the agent knows | `code-quality` | software-engineer | Quick reference — redirects to verify | | `pr-management` | software-engineer | Step 5: create PR with squash merge | | `git-release` | software-engineer | Step 5: calver versioning and release | +| `living-docs` | product-owner | Step 5 (after acceptance) + on stakeholder demand: C4 diagrams + glossary | | `create-skill` | software-engineer | Create new skills | | `create-agent` | human-user | Create new agents with research-backed design | \ No newline at end of file diff --git a/.opencode/skills/feature-selection/SKILL.md b/.opencode/skills/feature-selection/SKILL.md index 567e3ef..a195b20 100644 --- a/.opencode/skills/feature-selection/SKILL.md +++ b/.opencode/skills/feature-selection/SKILL.md @@ -38,6 +38,10 @@ Read each `.feature` file in `docs/features/backlog/`. 
Check its discovery secti - Non-BASELINED features are not eligible — they need Step 1 (scope) first - If no BASELINED features exist: inform the stakeholder; run `@product-owner` with `skill scope` to baseline the most promising backlog item first +**IMPORTANT** + +**NEVER move a feature to `in-progress/` unless its discovery section has `Status: BASELINED`** + ### 3. Score Each Candidate For each BASELINED feature, fill this table: @@ -96,7 +100,7 @@ Run @ ``` - If the feature has no `Rule:` blocks yet → Step 1 (SCOPE): `Run @product-owner — load skill scope and write stories` -- If the feature has `Rule:` blocks but no `@id` Examples → Step 1 Phase 4 (Criteria): `Run @product-owner — load skill scope and write acceptance criteria` +- If the feature has `Rule:` blocks but no `@id` Examples → Step 1 Stage 2 Step B (Criteria): `Run @product-owner — load skill scope and write acceptance criteria` - If the feature has `@id` Examples → Step 2 (ARCH): `Run @software-engineer — load skill implementation and write architecture stubs` ### 6. Commit diff --git a/.opencode/skills/git-release/SKILL.md b/.opencode/skills/git-release/SKILL.md index 34bd11f..fe85972 100644 --- a/.opencode/skills/git-release/SKILL.md +++ b/.opencode/skills/git-release/SKILL.md @@ -82,17 +82,29 @@ Add at the top: - description (#PR-number) ``` -### 5. Regenerate lockfile and commit version bump +### 5. Update living docs + +Run the `living-docs` skill to reflect the newly accepted feature in C4 diagrams and the glossary. This step runs inline — do not commit separately. + +Load and execute the full `living-docs` skill now: +- Update `docs/c4/context.md` (C4 Level 1) +- Update `docs/c4/container.md` (C4 Level 2, if multi-container) +- Update `docs/glossary.md` (living glossary) + +The `living-docs` commit step is **skipped** here — all changed files are staged together with the version bump in step 6. + +### 6. 
Regenerate lockfile and commit version bump After updating `pyproject.toml`, regenerate the lockfile — CI runs `uv sync --locked` and will fail if it is stale: ```bash uv lock -git add pyproject.toml /__init__.py CHANGELOG.md uv.lock +git add pyproject.toml /__init__.py CHANGELOG.md uv.lock \ + docs/c4/context.md docs/c4/container.md docs/glossary.md git commit -m "chore(release): bump version to v{version} - {Adjective Animal}" ``` -### 6. Create GitHub release +### 7. Create GitHub release Assign the SHA first so it expands correctly inside the notes string: @@ -123,7 +135,7 @@ gh release create "v{version}" \ **SHA**: \`${SHA}\`" ``` -### 7. If a hotfix commit follows the release tag +### 8. If a hotfix commit follows the release tag If CI fails after the release (e.g. a stale lockfile) and a hotfix commit is pushed, reassign the tag and GitHub release to that commit: @@ -151,6 +163,7 @@ The release notes and title do not need to change — only the target commit mov - [ ] `uv lock` run after version bump — lockfile must be up to date - [ ] `/__version__` matches `pyproject.toml` version - [ ] CHANGELOG.md updated +- [ ] `living-docs` skill run — C4 diagrams and glossary reflect the new feature - [ ] Release name not used before - [ ] Release notes follow the template format - [ ] If a hotfix was pushed after the tag: tag reassigned to hotfix commit diff --git a/.opencode/skills/implementation/SKILL.md b/.opencode/skills/implementation/SKILL.md index 0cc4b7f..61ada18 100644 --- a/.opencode/skills/implementation/SKILL.md +++ b/.opencode/skills/implementation/SKILL.md @@ -15,12 +15,12 @@ Steps 2 (Architecture) and 3 (TDD Loop) combined into a single skill. The softwa During implementation, correctness priorities are (in order): -1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns +1. 
**Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns > complex code > complicated code > failing code > no code 2. **One @id green** — the specific test under work passes, plus `test-fast` still passes 3. **Commit** — when a meaningful increment is green 4. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run at end-of-feature handoff -Design correctness is far more important than lint/pyright/coverage compliance. Never run lint, static-check, or coverage during the TDD loop — those are handoff-only checks. +Design correctness is far more important than lint/pyright/coverage compliance. Never run lint (ruff check, ruff format), static-check (pyright), or coverage during the TDD loop — those are handoff-only checks. --- @@ -28,7 +28,7 @@ Design correctness is far more important than lint/pyright/coverage compliance. ### Prerequisites (stop if any fail — escalate to PO) -1. `docs/features/in-progress/` contains only `.gitkeep` (no `.feature` files). If another `.feature` file exists, **STOP** — another feature is already in progress. +1. `docs/features/in-progress/` contains exactly one `.feature` file (not just `.gitkeep`). If none exists, **STOP** — update TODO.md `Next:` to `Run @product-owner — move the chosen feature to in-progress/` and stop. Never self-select or move a feature yourself. 2. The feature file's discovery section has `Status: BASELINED`. If not, escalate to PO — Step 1 is incomplete. 3. The feature file contains `Rule:` blocks with `Example:` blocks and `@id` tags. If not, escalate to PO — criteria have not been written. 4. Package name confirmed: read `pyproject.toml` → locate `[tool.setuptools]` → confirm directory exists on disk. @@ -37,26 +37,21 @@ Design correctness is far more important than lint/pyright/coverage compliance. 1. Read `pyproject.toml` → locate `[tool.setuptools]` → record `packages = [""]` 2. Confirm directory exists: `ls /` -3. 
All new source files go under `/` — never under a template placeholder. +3. All new source files go under `/` -### Move Feature File - -```bash -mv docs/features/backlog/.feature docs/features/in-progress/.feature -``` - -Update `TODO.md` Source path from `backlog/` to `in-progress/`. +**Note on feature file moves**: The PO moves `.feature` files between folders. The software-engineer never moves or edits `.feature` files. Update TODO.md `Source:` path to reflect `in-progress/` once the PO has moved the file. ### Read Phase (all before writing anything) -1. Read `docs/features/discovery.md` (project-level) -2. Read **ALL** `.feature` files in `docs/features/backlog/` (discovery + entities sections) -3. Read in-progress `.feature` file (full: Rules + Examples + @id) -4. Read **ALL** existing `.py` files in `/` — understand what already exists before adding anything +1. Read `docs/discovery.md` (project-level synthesis changelog) and optionally `docs/discovery_journal.md` (Q&A history for context) +2. Read `docs/glossary.md` if it exists — use existing domain terms when naming classes, methods, and modules; do not invent synonyms for terms already defined +3. Read **ALL** `.feature` files in `docs/features/backlog/` (discovery + entities sections) +4. Read in-progress `.feature` file (full: Rules + Examples + @id) +5. Read **ALL** existing `.py` files in `/` — understand what already exists before adding anything ### Domain Analysis -From Entities table + Rules (Business) in `.feature` file: +From the Domain Model table in `docs/discovery.md` + Rules (Business) in the `.feature` file: - **Nouns** → named classes, value objects, aggregates - **Verbs** → method names with typed signatures - **Datasets** → named types (not bare dict/list) @@ -116,19 +111,20 @@ class UserRepository(Protocol): Place stubs where responsibility dictates — do not pre-create `ports/` or `adapters/` folders unless a concrete external dependency was identified in scope. 
Structure follows domain analysis, not a template. -### Write ADR Files (significant decisions only) +### Record Architectural Decisions -For each significant architectural decision, create `docs/architecture/adr-NNN-.md`: +Append a new dated block to `docs/architecture.md` for each significant decision: ```markdown -# ADR-NNN: <title> +## YYYY-MM-DD — <feature-stem>: <short title> -**Decision:** <what was decided> -**Reason:** <why, one sentence> -**Alternatives considered:** <what was rejected and why> +Decision: <what was decided> +Reason: <why, one sentence> +Alternatives considered: <what was rejected and why> +Feature: <feature-stem> ``` -Only write an ADR if the decision is non-obvious or has meaningful trade-offs. Routine YAGNI choices do not need an ADR. +Only write a block for non-obvious decisions with meaningful trade-offs. Routine YAGNI choices do not need a record. ### Architecture Smell Check (hard gate) @@ -145,7 +141,11 @@ Apply to the stub files just written: If any check fails: fix the stub files before committing. -Commit: `feat(<feature-name>): add architecture stubs` +### Generate Test Stubs + +Run `uv run task test-fast` once. It reads the in-progress `.feature` file, assigns `@id` tags to any untagged `Example:` blocks (writing them back to the `.feature` file), and generates `tests/features/<feature_slug>/<rule_slug>_test.py` — one file per `Rule:` block, one skipped function per `@id`. Verify the files were created, then stage all changes (including any `@id` write-backs to the `.feature` file). + +Commit: `feat(<feature-stem>): add architecture and test stubs` --- @@ -153,35 +153,17 @@ Commit: `feat(<feature-name>): add architecture stubs` ### Prerequisites +- [ ] Exactly one .feature `in_progress`. 
If not present, **STOP** — escalate to the PO (`Run @product-owner — load skill feature-selection`); the software-engineer never self-selects a feature
- [ ] Architecture stubs present in `<package>/` (committed by Step 2)
-- [ ] Read all `docs/architecture/adr-NNN-*.md` files — understand the architectural decisions before writing any test
-- [ ] Test stub files exist in `tests/features/<feature-name>/` — one file per `Rule:` block, all `@id` functions present with `@pytest.mark.skip`; if missing, write them now before entering RED
-
-### Write Test Stubs (if not present)
-
-For each `Rule:` block in the in-progress `.feature` file, create `tests/features/<feature-name>/<rule-slug>_test.py` if it does not already exist. Write one function per `@id` Example, all skipped:
-
-```python
-@pytest.mark.skip(reason="not yet implemented")
-def test_<rule_slug>_<8char_hex>() -> None:
-    """
-    Given: ...
-    When: ...
-    Then: ...
-    """
-    # Given
-    # When
-    # Then
-```
-
-Run `uv run task gen-todo` after writing stubs to sync `@id` rows into `TODO.md`.
+- [ ] Read `docs/architecture.md` — understand all architectural decisions before writing any test
+- [ ] Test stub files exist in `tests/features/<feature_slug>/<rule_slug>_test.py` — generated by pytest-beehave at Step 2 end; if missing, re-run `uv run task test-fast` and commit the generated files before entering RED

### Build TODO.md Test List

1. List all `@id` tags from in-progress `.feature` file
2. Order: fewest dependencies first; most impactful within that set
3. Each `@id` = one TODO item, status: `pending`
4. 
Confirm each `@id` has a corresponding skipped stub in `tests/features/<feature_slug>/` — if any are missing, add them before proceeding ### Outer Loop — One @id at a time @@ -192,17 +174,17 @@ For each pending `@id`: ``` INNER LOOP ├── RED -│ ├── Confirm stub for this @id exists in tests/features/<feature-name>/ with @pytest.mark.skip +│ ├── Confirm stub for this @id exists in tests/features/<feature_slug>/<rule_slug>_test.py with @pytest.mark.skip │ ├── Read existing stubs in `<package>/` — base the test on the current data model and signatures │ ├── Write test body (Given/When/Then → Arrange/Act/Assert); remove @pytest.mark.skip -│ ├── Update stub signatures as needed — edit the `.py` file directly +│ ├── Update <package> stub signatures as needed — edit the `.py` file directly │ ├── uv run task test-fast │ └── EXIT: this @id FAILS │ (if it passes: test is wrong — fix it first) │ ├── GREEN │ ├── Write minimum code — YAGNI + KISS only -│ │ (no DRY, SOLID, OC here — those belong in REFACTOR) +│ │ (no DRY, SOLID, OC, Docstring, type hint here — those belong in REFACTOR) │ ├── uv run task test-fast │ └── EXIT: this @id passes AND all prior tests pass │ (fix implementation only; do not advance to next @id) @@ -221,7 +203,7 @@ Commit when a meaningful increment is green ```bash uv run task lint uv run task static-check -uv run task test # coverage must be 100% +uv run task test-coverage # coverage must be 100% timeout 10s uv run task run ``` @@ -231,44 +213,43 @@ All must pass before Self-Declaration. 
### Self-Declaration (once, after all quality gates pass) -Write into `TODO.md` under a `## Self-Declaration` block: - -```markdown -## Self-Declaration -As a software-engineer I declare: -* YAGNI: no code without a failing test — AGREE/DISAGREE | file:line -* YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line -* KISS: simplest solution that passes — AGREE/DISAGREE | file:line -* KISS: no premature optimization — AGREE/DISAGREE | file:line -* DRY: no duplication — AGREE/DISAGREE | file:line -* DRY: no redundant comments — AGREE/DISAGREE | file:line -* SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line -* SOLID-O: open for extension, closed for modification — AGREE/DISAGREE | file:line -* SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line -* SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line -* SOLID-D: depend on abstractions, not concretions — AGREE/DISAGREE | file:line -* OC-1: one level of indentation per method — AGREE/DISAGREE | deepest: file:line -* OC-2: no else after return — AGREE/DISAGREE | file:line -* OC-3: primitive types wrapped — AGREE/DISAGREE | file:line -* OC-4: first-class collections — AGREE/DISAGREE | file:line -* OC-5: one dot per line — AGREE/DISAGREE | file:line -* OC-6: no abbreviations — AGREE/DISAGREE | file:line -* OC-7: ≤20 lines per function, ≤50 per class — AGREE/DISAGREE | longest: file:line -* OC-8: ≤2 instance variables per class (behavioural classes only; dataclasses, Pydantic models, value objects, and TypedDicts are exempt) — AGREE/DISAGREE | file:line -* OC-9: no getters/setters — AGREE/DISAGREE | file:line -* Patterns: no creational smell — AGREE/DISAGREE | file:line -* Patterns: no structural smell — AGREE/DISAGREE | file:line -* Patterns: no behavioral smell — AGREE/DISAGREE | file:line -* Semantic: tests operate at same abstraction as AC — AGREE/DISAGREE | file:line -``` - -A `DISAGREE` answer is not automatic rejection — state the reason inline and fix before handing off. 
+<!-- This list has exactly 25 items — count before submitting. If your count ≠ 25, you missed one. --> + +Communicate verbally to the reviewer. Answer honestly for each principle: + +1. YAGNI: no code without a failing test — AGREE/DISAGREE | file:line +2. YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line +3. KISS: simplest solution that passes — AGREE/DISAGREE | file:line +4. KISS: no premature optimization — AGREE/DISAGREE | file:line +5. DRY: no duplication — AGREE/DISAGREE | file:line +6. DRY: no redundant comments — AGREE/DISAGREE | file:line +7. SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line +8. SOLID-O: open for extension, closed for modification — AGREE/DISAGREE | file:line +9. SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line +10. SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line +11. SOLID-D: depend on abstractions, not concretions — AGREE/DISAGREE | file:line +12. OC-1: one level of indentation per method — AGREE/DISAGREE | deepest: file:line +13. OC-2: no else after return — AGREE/DISAGREE | file:line +14. OC-3: primitive types wrapped — AGREE/DISAGREE | file:line +15. OC-4: first-class collections — AGREE/DISAGREE | file:line +16. OC-5: one dot per line — AGREE/DISAGREE | file:line +17. OC-6: no abbreviations — AGREE/DISAGREE | file:line +18. OC-7: ≤20 lines per function, ≤50 per class — AGREE/DISAGREE | longest: file:line +19. OC-8: ≤2 instance variables per class (behavioural classes only; dataclasses, Pydantic models, value objects, and TypedDicts are exempt) — AGREE/DISAGREE | file:line +20. OC-9: no getters/setters — AGREE/DISAGREE | file:line +21. Patterns: no good reason remains to refactor using OOP or Design Patterns — AGREE/DISAGREE | file:line +22. Patterns: no creational smell — AGREE/DISAGREE | file:line +23. Patterns: no structural smell — AGREE/DISAGREE | file:line +24. Patterns: no behavioral smell — AGREE/DISAGREE | file:line +25. 
Semantic: tests operate at same abstraction as AC — AGREE/DISAGREE | file:line + +A `DISAGREE` answer is not automatic rejection — state the reason and fix before handing off. ### Hand off to Step 4 (Verify) Signal completion to the reviewer. Provide: - Feature file path -- Self-Declaration from TODO.md +- Self-Declaration (communicated verbally, as above) - Summary of what was implemented --- @@ -278,20 +259,20 @@ Signal completion to the reviewer. Provide: ### Test File Layout ``` -tests/features/<feature-name>/<rule-slug>_test.py +tests/features/<feature_slug>/<rule_slug>_test.py ``` -- `<feature-name>` = the `.feature` file stem -- `<rule-slug>` = the `Rule:` title slugified +- `<feature_slug>` = the `.feature` file stem with hyphens replaced by underscores, lowercase +- `<rule_slug>` = the `Rule:` title slugified (lowercase, underscores) ### Function Naming ```python -def test_<rule_slug>_<8char_hex>() -> None: +def test_<feature_slug>_<@id>() -> None: ``` -- `rule_slug` = the `Rule:` title with spaces/hyphens replaced by underscores, lowercase -- `8char_hex` = the `@id` from the `Example:` block +- `feature_slug` = the `.feature` file stem with spaces/hyphens replaced by underscores, lowercase +- `@id` = the `@id` from the `Example:` block ### Docstring Format (mandatory) @@ -299,27 +280,23 @@ New tests start as skipped stubs. 
Remove `@pytest.mark.skip` when implementing i ```python @pytest.mark.skip(reason="not yet implemented") -def test_wall_bounce_a3f2b1c4() -> None: +def test_<feature_slug>_<@id>() -> None: """ - Given: A ball moving upward reaches y=0 - When: The physics engine processes the next frame - Then: The ball velocity y-component becomes positive + <@id steps raw text including new lines> """ - # Given - # When - # Then ``` **Rules**: -- Docstring contains `Given:/When:/Then:` on separate indented lines +- Docstring contains `Gherkin steps` as raw text on separate indented lines - No extra metadata in docstring — traceability comes from function name `@id` suffix ### Markers - `@pytest.mark.slow` — takes > 50ms (Hypothesis, DB, network, terminal I/O) -- `@pytest.mark.deprecated` — auto-skipped by conftest; used for superseded Examples +- `@pytest.mark.deprecated` — auto-skipped by pytest-beehave; used for superseded Examples ```python +@pytest.mark.deprecated def test_wall_bounce_a3f2b1c4() -> None: ... @@ -350,11 +327,11 @@ def test_wall_bounce_c4d5e6f7(x: float) -> None: **Rules**: - `@pytest.mark.slow` is mandatory on every `@given`-decorated test - `@example(...)` is optional but encouraged -- Never use Hypothesis for: I/O, side effects, network calls, database writes +- Do not use Hypothesis for: I/O, side effects, network calls, database writes ### Semantic Alignment Rule -The test's Given/When/Then must operate at the **same abstraction level** as the AC's Given/When/Then. +The test's Given/When/Then must operate at the **same abstraction level** as the AC's Steps. 
| AC says | Test must do | |---|---| @@ -369,7 +346,7 @@ If testing through the real entry point is infeasible, escalate to PO to adjust - No `isinstance()`, `type()`, or internal attribute (`_x`) checks in assertions - One assertion concept per test (multiple `assert` ok if they verify the same thing) - No `pytest.mark.xfail` without written justification -- `pytest.mark.skip` is only valid on stubs (`reason="not yet implemented"`) — remove it when implementing +- `pytest.mark.skip(reason="not yet implemented")` is only valid on stubs — remove it when implementing - Test data embedded directly in the test, not loaded from external files ### Test Tool Decision @@ -396,7 +373,7 @@ Extra tests in `tests/unit/` are allowed freely (coverage, edge cases, etc.) — ## Signature Design -Signatures are written during Step 2 (Architecture) and refined during Step 3 (RED). They live directly in the package `.py` files — never in the `.feature` file. +<package> signatures are written during Step 2 (Architecture) and refined during Step 3 (RED). They live directly in the package `.py` files — never in the `.feature` file. Key rules: - Bodies are always `...` in the architecture stub @@ -420,4 +397,4 @@ class EmailAddress: class UserRepository(Protocol): def save(self, user: "User") -> None: ... def find_by_email(self, email: EmailAddress) -> "User | None": ... 
-``` \ No newline at end of file +``` diff --git a/.opencode/skills/living-docs/SKILL.md b/.opencode/skills/living-docs/SKILL.md new file mode 100644 index 0000000..8472547 --- /dev/null +++ b/.opencode/skills/living-docs/SKILL.md @@ -0,0 +1,213 @@ +--- +name: living-docs +description: Generate and update C4 architecture diagrams and the living glossary from existing project docs +version: "1.0" +author: product-owner +audience: product-owner +workflow: feature-lifecycle +--- + +# Living Docs + +This skill generates and updates two living documents after a feature is accepted (Step 5) or on stakeholder request: the **C4 architecture diagrams** and the **living glossary**. Both are derived from existing project documentation — no new decisions are made. + +The glossary is a secondary artifact derived from the code, the domain model, and domain-expert conversations. The canonical sources are the completed feature files, the discovery synthesis, and the architectural decisions. The glossary is a human-readable projection of those sources — not an independent authority. + +## When to Use + +- **As part of the release process (Step 5)** — the `git-release` skill calls this skill inline at step 5, before the version-bump commit. Do not commit separately; the release process stages all files together. +- **Stakeholder on demand** — when the stakeholder asks "what does the system look like?" or "what does term X mean in this context?". In this case, commit with the standalone message in Step 5 below. 
+ +## Ownership Rules + +| Document | Created/Updated by | Inputs read | +|---|---|---| +| `docs/c4/context.md` | `living-docs` skill (PO) | `docs/discovery.md`, `docs/features/completed/` | +| `docs/c4/container.md` | `living-docs` skill (PO) | `docs/architecture.md`, `docs/features/completed/` | +| `docs/glossary.md` | `living-docs` skill (PO) | `docs/discovery.md`, `docs/glossary.md` (existing), `docs/architecture.md`, `docs/features/completed/` | +| `docs/architecture.md` | SE only (Step 2) | — | +| `docs/discovery.md` | PO only (Step 1) | — | + +**Never edit `docs/architecture.md` or `docs/discovery.md` in this skill.** Those files are append-only by their respective owners. This skill reads them; it never writes to them. + +--- + +## Step 1 — Read Phase (all before writing anything) + +Read in this order: + +1. `docs/discovery.md` — project scope, domain model (nouns/verbs), feature list per session +2. `docs/features/completed/` — all completed `.feature` files (full text: Rules, Examples, Constraints) +3. `docs/architecture.md` — all architectural decisions (containers, modules, protocols, external deps) +4. `docs/c4/` — existing C4 diagrams if they exist (update, do not replace from scratch) +5. 
`docs/glossary.md` — existing glossary if it exists (extend, never remove existing entries) + +Identify from the read phase: + +- **Actors** — named human roles from feature `As a <role>` clauses and discovery Scope section +- **External systems** — any system outside the package boundary named in features or architecture decisions +- **Containers** — deployable/runnable units identified in `docs/architecture.md` (Hexagonal adapters, CLIs, services) +- **Key domain terms** — all nouns from `docs/discovery.md` Domain Model tables, plus any terms defined in `docs/architecture.md` decisions + +--- + +## Step 2 — Update C4 Context Diagram (Level 1) + +File: `docs/c4/context.md` + +The Context diagram answers: **who uses the system and what external systems does it interact with?** + +Use Mermaid `C4Context` syntax. Template: + +```markdown +# C4 — System Context + +> Last updated: YYYY-MM-DD +> Source: docs/discovery.md, docs/features/completed/ + +```mermaid +C4Context + title System Context — <project-name> + + Person(actor1, "<role name>", "<one-line description from feature As a clauses>") + + System(system, "<project-name>", "<3–5 word system description from discovery.md Scope>") + + System_Ext(ext1, "<external system name>", "<what it provides>") + + Rel(actor1, system, "<verb from When clause>") + Rel(system, ext1, "<verb from architecture.md decision>") +``` +``` + +Rules: +- One `Person(...)` per distinct actor found in completed feature files +- One `System_Ext(...)` per external dependency identified in `docs/architecture.md` decisions +- Relationships (`Rel`) use verb phrases from feature `When` clauses or architecture decision labels +- If no external systems are identified in `docs/architecture.md`, omit `System_Ext` entries +- If the file already exists: update only — add new actors/systems, update relationship labels. 
Never remove an existing entry unless the feature it came from has been explicitly superseded + +--- + +## Step 3 — Update C4 Container Diagram (Level 2) + +File: `docs/c4/container.md` + +The Container diagram answers: **what are the major runnable/deployable units and how do they communicate?** + +Only generate this diagram if `docs/architecture.md` contains at least one decision identifying a distinct container boundary (e.g., a CLI entry point separate from a library, a web server, a background worker, an external service adapter). If the project is a single-container system, note this in the file and skip the diagram body. + +Use Mermaid `C4Container` syntax. Template: + +```markdown +# C4 — Container Diagram + +> Last updated: YYYY-MM-DD +> Source: docs/architecture.md + +```mermaid +C4Container + title Container Diagram — <project-name> + + Person(actor1, "<role>", "") + + System_Boundary(sys, "<project-name>") { + Container(container1, "<name>", "<technology>", "<responsibility from architecture.md>") + Container(container2, "<name>", "<technology>", "<responsibility>") + } + + System_Ext(ext1, "<external system>", "") + + Rel(actor1, container1, "<action>") + Rel(container1, container2, "<protocol or method>") + Rel(container1, ext1, "<protocol>") +``` +``` + +Rules: +- Container names and responsibilities come directly from `docs/architecture.md` decisions — do not invent them +- Technology labels come from `pyproject.toml` dependencies when identifiable (e.g., "Python / fire CLI", "Python / FastAPI") +- If the file already exists: update incrementally — do not regenerate from scratch + +--- + +## Step 4 — Update Living Glossary + +File: `docs/glossary.md` + +The glossary answers: **what does each domain term mean in this project's context?** + +### Format + +```markdown +# Glossary — <project-name> + +> Living document. Updated after each completed feature by the `living-docs` skill. 
+> Source: docs/discovery.md (Domain Model), docs/features/completed/, docs/architecture.md + +--- + +## <Term> + +**Type:** Noun | Verb | Domain Event | Concept | Role | External System + +**Definition:** <one sentence, plain English, no jargon> + +**Bounded context:** <name of the bounded context where this term is defined; required when the project has more than one bounded context; omit only for single-context projects> + +**First appeared:** <YYYY-MM-DD discovery session or feature name> + +--- +``` + +### Rules + +- Extract all nouns and verbs from every `### Domain Model` table in `docs/discovery.md` +- Extract all roles from `As a <role>` clauses in completed `.feature` files +- Extract all external system names from `docs/architecture.md` decisions +- Extract any term defined or clarified in architectural decision `Reason:` fields +- **Do not remove existing glossary entries** — if a term's meaning has changed, add a `**Superseded by:**` line pointing to the new entry and write a new entry +- **Every term must have a traceable source** — `docs/discovery.md` Domain Model tables, completed feature files, or `docs/architecture.md` decisions. If a term appears in sources but is never defined, write `Definition: Term appears in [source] but has not been explicitly defined.` Do not invent a definition. +- Terms are sorted alphabetically within the file + +### Merge with existing glossary + +If `docs/glossary.md` already exists: +1. Read all existing entries +2. For each new term found in sources: check if it already exists in the glossary + - Exists, definition unchanged → skip + - Exists, definition changed → append `**Superseded by:** <new-term-or-date>` to old entry; write new entry + - Does not exist → append new entry in alphabetical order + +--- + +## Step 5 — Commit + +**When called from the release process**: skip this step — the `git-release` skill stages and commits all files together. 
+ +**When run standalone** (stakeholder on demand): commit after all diagrams and glossary are updated: + +``` +docs(living-docs): update C4 and glossary after <feature-stem> +``` + +If triggered without a specific feature (general refresh): + +``` +docs(living-docs): refresh C4 diagrams and glossary +``` + +--- + +## Checklist + +- [ ] Read all five source files before writing anything +- [ ] Context diagram reflects all actors from completed feature files +- [ ] Context diagram reflects all external systems from `docs/architecture.md` +- [ ] Container diagram present only if multi-container architecture confirmed in `docs/architecture.md` +- [ ] Glossary contains all nouns and verbs from `docs/discovery.md` Domain Model tables +- [ ] No existing glossary entry removed +- [ ] Every new term has a traceable source in `docs/discovery.md` Domain Model tables, completed feature files, or `docs/architecture.md`; no term is invented +- [ ] No edits made to `docs/architecture.md` or `docs/discovery.md` +- [ ] If standalone: committed with `docs(living-docs): ...` message +- [ ] If called from release: no commit made — the `git-release` skill stages and commits all files together diff --git a/.opencode/skills/pr-management/SKILL.md b/.opencode/skills/pr-management/SKILL.md index f10605c..94af430 100644 --- a/.opencode/skills/pr-management/SKILL.md +++ b/.opencode/skills/pr-management/SKILL.md @@ -14,7 +14,7 @@ Create and manage pull requests after the reviewer approves the feature (Step 5) ## Branch Naming ``` -feature/<feature-name> # new feature +feature/<feature-stem> # new feature fix/<issue-description> # bug fix refactor/<scope> # refactoring docs/<scope> # documentation @@ -42,7 +42,7 @@ git commit -m "chore(deps): add python-dotenv dependency" ```bash # Push branch -git push -u origin feature/<feature-name> +git push -u origin feature/<feature-stem> # Create PR gh pr create \ diff --git a/.opencode/skills/refactor/SKILL.md b/.opencode/skills/refactor/SKILL.md index 6d84a2e..208d12d 100644 --- 
a/.opencode/skills/refactor/SKILL.md +++ b/.opencode/skills/refactor/SKILL.md @@ -265,9 +265,9 @@ Refactoring commits are always **separate** from feature commits. | Commit type | Message format | When | |---|---|---| -| Preparatory refactoring | `refactor(<feature-name>): <what>` | Before RED, to make the feature easier | -| REFACTOR phase | `refactor(<feature-name>): <what>` | After GREEN, cleaning up the green code | -| Feature addition | `feat(<feature-name>): <what>` | After GREEN (never mixed with refactor) | +| Preparatory refactoring | `refactor(<feature-stem>): <what>` | Before RED, to make the feature easier | +| REFACTOR phase | `refactor(<feature-stem>): <what>` | After GREEN, cleaning up the green code | +| Feature addition | `feat(<feature-stem>): <what>` | After GREEN (never mixed with refactor) | Never mix a structural cleanup with a behavior addition in one commit. This keeps history bisectable and CI green at every commit. diff --git a/.opencode/skills/scope/SKILL.md b/.opencode/skills/scope/SKILL.md index 14af5f6..cd02bd5 100644 --- a/.opencode/skills/scope/SKILL.md +++ b/.opencode/skills/scope/SKILL.md @@ -1,7 +1,7 @@ --- name: scope description: Step 1 — discover requirements through stakeholder interviews and write Gherkin acceptance criteria -version: "4.0" +version: "5.0" author: product-owner audience: product-owner workflow: feature-lifecycle @@ -13,20 +13,18 @@ This skill guides the PO through Step 1 of the feature lifecycle: interviewing t ## When to Use -When the PO is starting a new project or a new feature. The output is a set of `.feature` files in `docs/features/backlog/`. +When the PO is starting a new project, adding features, or refining an existing feature. The output is a set of `.feature` files in `docs/features/backlog/` ready for development. ## Overview -Step 1 has 4 phases: +Step 1 has two stages: -| Phase | Who | Output | +| Stage | Who | Output | |---|---|---| -| 1. 
Project Discovery | PO + stakeholder | `docs/features/discovery.md` + feature list | -| 2. Feature Discovery | PO + stakeholder | Discovery section embedded in `docs/features/backlog/<name>.feature` | -| 3. Stories | PO alone | `Rule:` blocks in the `.feature` file (no Examples) | -| 4. Criteria | PO alone | `Example:` blocks with `@id` tags under each `Rule:` | +| **Stage 1 — Discovery** | PO + stakeholder | `docs/discovery_journal.md` (Q&A) + `docs/discovery.md` (synthesis) + `.feature` descriptions | +| **Stage 2 — Specification** | PO alone | `Rule:` blocks + `Example:` blocks with `@id` tags in `.feature` files | -Each phase produces a template-gated deliverable. A section must be complete and confirmed before the next section unlocks. Template enforcement is the process discipline — not a "baseline" command. +Stage 1 is iterative and ongoing — sessions happen whenever the PO or stakeholder needs to discover or refine scope. Stage 2 runs per feature, only after that feature has `Status: BASELINED`. --- @@ -64,21 +62,37 @@ Three levels of active listening apply throughout every interview session: - **Level 1 — Per answer**: immediately paraphrase each answer before moving to the next question. "So if I understand correctly, you're saying that X happens when Y?" Catches misunderstanding in the moment. - **Level 2 — Per group**: brief synthesis when transitioning between behavior groups. "We've covered [area A] and [area B]. Before I ask about [area C], here is what I understood so far: [summary]. Does that capture it?" Confirms completeness, gives stakeholder a recovery point. -- **Level 3 — End of session**: full synthesis of everything discussed. Present to stakeholder for approval. This is the accuracy gate, the baseline signal, and the input to domain modeling. +- **Level 3 — End of session**: full synthesis of everything discussed. Present to stakeholder for approval. This is the accuracy gate and the input to domain modeling. 
Do not introduce topic labels or categories during active listening. The summary must reflect what the stakeholder said, not new framing that prompts reactions to things they haven't considered. --- -## Phase 1 — Project Discovery +## Stage 1 — Discovery -**When**: Once per project, before any features are scoped. **Skip entirely if `discovery.md` Status is `BASELINED`.** Adding features to an existing project: append new questions to Session 1 and re-fill from there. +Discovery is a continuous, iterative process. Sessions happen whenever scope needs to be established or refined — for a new project, for a new feature, or when new information emerges. There is no "Phase 1" vs "Phase 2" distinction; every session follows the same structure. -### Session 1 — Individual Scope Elicitation +### Session Start (every session) -**Before the session**: Create `docs/features/discovery.md` using the project-level discovery template. Open to the Session 1 section. +**Before asking any questions:** -**Ask the 7 standard questions** (present all at once): +1. Check `docs/discovery_journal.md` for the most recent session block. + - If the most recent block has `Status: IN-PROGRESS` → the previous session was interrupted. Resume it: check which `.feature` files need updating (compare journal Q&A against current `.feature` descriptions), write the `discovery.md` synthesis block if missing, then mark the block `Status: COMPLETE`. Only then begin a new session. + - If `docs/discovery_journal.md` does not exist → this is the first session. Create both `docs/discovery_journal.md` and `docs/discovery.md` using the templates at the end of this skill. +2. Open `docs/discovery_journal.md` and append a new session header: + ```markdown + ## YYYY-MM-DD — Session N + Status: IN-PROGRESS + ``` + Write this header **before** asking any questions. This is the durability marker — if the session is interrupted, the next agent sees `IN-PROGRESS` and knows writes are pending. 
+ +### Question Order (within every session) + +Questions follow this order. Skip a group only if it was already fully covered in a prior session. + +**1. General questions** (skip entirely if any prior session has covered these) + +Ask all 7 at once: 1. **Who** are the users of this product? 2. **What** does the product do at a high level? @@ -88,113 +102,81 @@ Do not introduce topic labels or categories during active listening. The summary 6. **Failure** — what does failure look like? What must never happen? 7. **Out-of-scope** — what are we explicitly not building? -**During the session**: Apply Level 1 active listening (paraphrase each answer). Apply CIT, Laddering, and CI Perspective Change per answer to surface gaps. Add new questions to the Questions table as they arise — do not defer to a later session. - -**After the session**: - -1. Write the **Session 1 Synthesis** in `discovery.md`: a 3–5 sentence summary of who the users are, what the product does, why it exists, its success/failure conditions, and explicit out-of-scope boundaries. -2. Present the synthesis to the stakeholder: "Here is my understanding of what you told me — please correct anything that is missing or wrong." -3. Stakeholder confirms or corrects. PO refines until approved. -4. Run a **silent pre-mortem** on the confirmed synthesis: "Imagine we build exactly what was described, ship it, and it fails. What was missing?" Add any discoveries as new questions to the Questions table. -5. Mark `Template §1: CONFIRMED` in `discovery.md`. This unlocks Session 2. - -### Session 2 — Behavior Groups / Big Picture - -**Before the session**: Review the confirmed Session 1 synthesis. Identify behavior groups (cross-cutting concerns, system-wide constraints, integration points, lifecycle questions). Prepare group-level questions. - -**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between groups. 
Apply CIT, Laddering, and CI Perspective Change per group. Add new questions in the moment. +Apply Level 1 active listening per answer. Apply CIT, Laddering, and CI Perspective Change per answer to surface gaps. Add new questions in the moment. -**After the session**: - -1. For each group, write a **Group Summary** in `discovery.md`. -2. Mark `Template §2: CONFIRMED` in `discovery.md`. This unlocks Session 3. - -### Session 3 — Synthesis Approval + Feature Derivation - -**Before the session**: Produce a **Full Synthesis** across all behavior groups from Sessions 1 and 2. Write it to `discovery.md`. - -**During the session**: Present the full synthesis to the stakeholder. "This is my understanding of the full scope. Please correct anything that is missing or wrong." Stakeholder approves or corrects. PO refines until the stakeholder explicitly approves. - -**After the session** (PO alone): - -1. Domain analysis: extract all nouns (candidate entities) and verbs (candidate operations) from the approved synthesis. -2. Group nouns into subject areas (Bounded Contexts: where the same word means different things, a new context begins). -3. Name each subject area as a feature using FDD "Action object" triples: "Calculate the total of a sale", "Validate the password of a user", "Enroll a student in a seminar". -4. For each feature: create `docs/features/backlog/<name>.feature` using the feature file template (discovery section only — no Rules yet). -5. Write `Status: BASELINED (YYYY-MM-DD)` to `discovery.md`. - -Commit: `feat(discovery): baseline project discovery` - ---- +**2. Cross-cutting questions** -## Phase 2 — Feature Discovery +Target behavior groups, bounded contexts, integration points, lifecycle events, and system-wide constraints. Apply Level 2 active listening when transitioning between groups. -**When**: Per feature, after project discovery is baselined. Each `.feature` file has its own 3-session discovery template in its description. +**3. 
Feature questions** (one feature at a time) -### Session 1 — Individual Entity Elicitation +For each feature the session touches: +- Extract relevant nouns and verbs from `docs/discovery.md` Domain Model (if it exists) +- Generate questions from entity gaps: boundaries, edge cases, interactions, failure modes +- Run a silent pre-mortem: "Imagine the developer builds this feature exactly as described, all tests pass, but the feature doesn't work for the user. What would be missing?" +- Apply CIT, Laddering, and CI Perspective Change per question -**Before the session**: Open `docs/features/backlog/<name>.feature`. +**Real-time split rule**: if, during feature questions, the PO detects >2 distinct concerns OR >8 candidate Examples for a single feature, **split immediately**: +1. Record the split in the journal: note the original feature name and the two new names +2. Create stub `.feature` files for both parts (if they don't already exist) +3. Continue feature questions for both new features in sequence within the same session -1. **Populate the Entities table**: extract nouns (candidate classes) and verbs (candidate methods) from the project discovery synthesis that are relevant to this feature. Mark each as in-scope or not. -2. **Generate questions from entity gaps**: for each in-scope entity, ask internally: - - What are its boundaries and edge cases? - - What happens when it is missing, invalid, or at its limits? - - How does it interact with other in-scope entities? -3. Add questions to the Session 1 Questions table. -4. Run a **silent pre-mortem**: "Imagine the developer builds this feature exactly as described, all tests pass, but the feature doesn't work for the user. What would be missing?" Add any discoveries as new questions. +### After Questions (PO alone, same session) -**During the session**: Apply Level 1 active listening per answer. Apply CIT, Laddering, and CI Perspective Change per answer. Add new questions in the moment. 
+**Step A — Write answered Q&A to journal** -**After the session**: +Append all answered Q&A to `docs/discovery_journal.md`, in groups (general, cross-cutting, then per-feature). Write only answered questions. Unanswered questions are discarded. -1. Write the **Session 1 Synthesis** in the `.feature` file: summarize the key entities, their relationships, and the constraints that emerged. -2. Present the synthesis to the stakeholder. Stakeholder confirms or corrects. PO refines until approved. -3. Run a **silent pre-mortem** on the confirmed synthesis. -4. Mark `Template §1: CONFIRMED`. This unlocks Session 2. +Group headers use this format: +- General group: `### General` +- Cross-cutting group: `### <Group Name>` +- Feature group: `### Feature: <feature-stem>` -### Session 2 — Behavior Groups / Big Picture for This Feature +**Step B — Update .feature descriptions** -**Before the session**: Review the confirmed Session 1 synthesis. Identify behavior groups within this feature (happy paths, error paths, edge cases, lifecycle events, integration points). +For each feature touched in this session: rewrite the `.feature` file description to reflect the current state of understanding. Only touched features are updated; all others remain exactly as-is. -**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between groups. Apply CIT, Laddering, and CI Perspective Change per group. +If a feature is new (just created as a stub): write its initial description now. -**After the session**: +**Step C — Append session synthesis to discovery.md (LAST)** -1. Write **Group Summaries** in the `.feature` file. Name each group — these names become candidate `Rule:` titles. -2. Mark `Template §2: CONFIRMED`. This unlocks Session 3. +After all `.feature` files are updated, append one `## Session: YYYY-MM-DD` block to `docs/discovery.md`. 
The block contains: +- `### Feature List` — which features were added or changed (0–N entries); if nothing changed, write "No changes" +- `### Domain Model` — new or updated domain entities and verbs; if nothing changed, write "No changes" +- `### Context` (first session only) — 3–5 sentence synthesis of who the users are, what the product does, why it exists, success/failure conditions, and explicit out-of-scope -### Session 3 — Feature Synthesis Approval + Story Derivation +**Step D — Mark session complete** -**Before the session**: Produce a **Full Synthesis** of the feature scope, covering all behavior groups from Sessions 1 and 2. - -**During the session**: Present the full synthesis to the stakeholder. Stakeholder approves or corrects. PO refines until explicitly approved. - -**After the session** (PO alone): - -1. Map each named group from Session 2 to a candidate user story (Rule). -2. Write `Status: BASELINED (YYYY-MM-DD)` to the `.feature` file's discovery section. -3. Mark `Template §3: CONFIRMED`. +Update the session header in `docs/discovery_journal.md`: +```markdown +## YYYY-MM-DD — Session N +Status: COMPLETE +``` -Commit: `feat(discovery): baseline <name> feature discovery` +**Commit**: `feat(discovery): <one-sentence summary of session>` -### Decomposition Check +### Baselining a Feature -After Session 3, before moving to Phase 3: +A feature is baselined when the stakeholder has explicitly approved its discovery. The PO writes `Status: BASELINED (YYYY-MM-DD)` in the `.feature` file. -Does this feature span **>2 distinct concerns** OR have **>8 candidate Examples**? 
+**Gate**: a feature may only be baselined when: +- Its description accurately reflects the stakeholder's approved understanding +- Its candidate user stories (Rule candidates) are identified +- The decomposition check passes: does not span >2 concerns AND does not have >8 candidate Examples -- **YES** → split into separate `.feature` files in `backlog/`, each addressing a single cohesive concern. Re-run Phase 2 for any split feature that needs its own discovery. -- **NO** → proceed to Phase 3. +A baselined feature is ready for Stage 2. The PO may baseline features one at a time — not all at once. --- -## Phase 3 — Stories +## Stage 2 — Specification -**When**: After feature discovery is baselined and decomposition check passes. PO works alone. +Stage 2 runs per feature, after `Status: BASELINED`. PO works alone. No stakeholder involvement. -### 3.1 Write Rule Blocks +If the PO discovers a gap during Stage 2 that requires stakeholder input: stop Stage 2, open a new Stage 1 session, resolve the gap, then return to Stage 2. -Clusters from Phase 2 Session 2 → one `Rule:` block per user story. Add after the discovery section in the `.feature` file. +### Step A — Stories + +Derive `Rule:` blocks from the baselined feature description. One `Rule:` per user story. Each `Rule:` block contains: - The rule title (2-4 words, kebab-friendly) @@ -217,9 +199,7 @@ Good stories are: Avoid: "As the system, I want..." (no business value). Break down stories that contain "and" into two Rules. -### 3.2 INVEST Gate - -Before committing, verify every Rule passes: +**INVEST Gate** — verify every Rule before committing: | Letter | Question | FAIL action | |---|---|---| @@ -230,34 +210,25 @@ Before committing, verify every Rule passes: | **S**mall | Completable in one feature cycle? | Split into smaller Rules | | **T**estable | Can it be verified with a concrete test? 
| Rewrite with observable outcomes | -### 3.3 Review Checklist - +**Review checklist:** - [ ] Every Rule has a distinct user role and benefit - [ ] No Rule duplicates another -- [ ] Rules collectively cover all entities marked in-scope in the discovery section +- [ ] Rules collectively cover all entities in scope from the feature description - [ ] Every Rule passes the INVEST gate -Commit: `feat(stories): write user stories for <name>` +Commit: `feat(stories): write user stories for <feature-stem>` ---- +### Step B — Criteria -## Phase 4 — Criteria +Add `Example:` blocks under each `Rule:`. PO writes all Examples alone, based on the approved feature description and domain knowledge. No stakeholder review of individual Examples. -**When**: After stories are written. PO works alone. - -### 4.1 Silent Pre-mortem Per Rule - -For each `Rule:` block, ask internally before writing any Examples: +**Silent pre-mortem per Rule** (before writing any Examples): > "What observable behaviors must we prove for this Rule to be complete?" All Rules must have their pre-mortems completed before any Examples are written. -### 4.2 Write Example Blocks - -Add `Example:` blocks under each `Rule:`. Each Example gets an `@id:<8-char-hex>` tag. - -**Format** (mandatory): +**Example format** (mandatory): ```gherkin Rule: Wall bounce @@ -273,15 +244,12 @@ Add `Example:` blocks under each `Rule:`. 
Each Example gets an `@id:<8-char-hex> ``` **Rules**: -- `@id` tag on the line before `Example:` - `Example:` keyword (not `Scenario:`) - `Given/When/Then` in plain English - `Then` must be a single, observable, measurable outcome — no "and" - **Observable means observable by the end user**, not by a test harness - **Declarative, not imperative** — describe behavior, not UI steps - Each Example must be observably distinct from every other -- If a single feature spans multiple concerns, split into separate `.feature` files -- If user interaction is involved, the Feature description must declare the interaction model **Declarative vs. imperative Gherkin**: @@ -300,11 +268,8 @@ Add `Example:` blocks under each `Rule:`. Each Example gets an `@id:<8-char-hex> - Examples that test implementation details ("Then: the Strategy pattern is used") - Imperative UI steps instead of declarative behavior descriptions -### 4.3 Review Checklist - -Before committing: +**Review checklist:** - [ ] Every `Rule:` block has at least one Example -- [ ] Every `@id` is unique within this feature - [ ] Every Example has `Given/When/Then` - [ ] Every `Then` is a single, observable, measurable outcome - [ ] No Example tests implementation details @@ -312,61 +277,68 @@ Before committing: - [ ] Each Example is observably distinct from every other - [ ] No single feature file spans multiple unrelated concerns -### 4.4 Commit and Freeze +**Self-Declaration (mandatory before criteria commit)** -```bash -git add docs/features/backlog/<name>.feature -git commit -m "feat(criteria): write acceptance criteria for <name>" -``` +Communicate verbally to the next agent. Every `DISAGREE` is a **hard blocker** — fix before committing. Do not commit until all items are AGREE or have a documented resolution. 
+ +- INVEST-I: each Rule is Independent (no hidden ordering or dependency between Rules) — AGREE/DISAGREE | conflict: +- INVEST-V: each Rule delivers Value to a named user — AGREE/DISAGREE | Rule: +- INVEST-S: each Rule is Small enough for one development cycle — AGREE/DISAGREE | Rule: +- INVEST-T: each Rule is Testable (I can write a pass/fail Example for it) — AGREE/DISAGREE | Rule: +- Observable: every Then is a single, observable, measurable outcome — AGREE/DISAGREE | file:line +- No impl details: no Example tests internal state or implementation — AGREE/DISAGREE | file:line +- Coverage: every entity in the feature description appears in at least one Rule — AGREE/DISAGREE | missing: +- Distinct: no two Examples test the same observable behavior — AGREE/DISAGREE | file:line +- Pre-mortem: I ran a pre-mortem on each Rule and found no hidden failure modes — AGREE/DISAGREE | Rule: +- Scope: no Example introduces behavior outside the feature boundary — AGREE/DISAGREE | file:line + +Commit: `feat(criteria): write acceptance criteria for <feature-stem>` -**After this commit, the `Example:` blocks are frozen.** Any change requires: +**After this commit, `Example:` blocks are frozen.** Any change requires: 1. Add `@deprecated` tag to the old Example -2. Write a new Example with a new `@id` +2. Write a new Example (the `@id` tag will be assigned automatically) + +--- + +## Bug Handling + +When a defect is reported against a completed or in-progress feature: + +1. **PO** adds a new Example to the relevant `Rule:` block in the `.feature` file: + + ```gherkin + @bug + Example: <what the bug is> + Given <conditions that trigger the bug> + When <action> + Then <correct behavior> + ``` + +2. **SE** implements the specific test in `tests/features/<feature_slug>/` (the `@id` test). +3. **SE** also writes a `@given` Hypothesis property test in `tests/unit/` covering the whole class of inputs that triggered the bug — not just the single case. +4. 
Both tests are required — neither is optional. +5. SE follows the normal TDD loop (Step 3) for the new `@id`. --- ## Feature File Format -Each feature is a single `.feature` file. The free-form description before the first `Rule:` contains all discovery content. Architecture is added later by the developer (Step 2). +Each feature is a single `.feature` file. The description block contains the feature description and Status. All Q&A belongs in `docs/discovery_journal.md`; all architectural decisions belong in `docs/architecture.md`. ```gherkin Feature: <Feature title> - Discovery: + <2–4 sentence description of what this feature does and why it exists. + Written in plain language, always kept current by the PO.> Status: ELICITING | BASELINED (YYYY-MM-DD) - Entities: - | Type | Name | Candidate Class/Method | In Scope | - |------|------|----------------------|----------| - | Noun | Ball | Ball | Yes | - | Verb | Bounce | Ball.bounce() | Yes | - Rules (Business): - <Business rule that applies across multiple Examples> Constraints: - <Non-functional requirement specific to this feature> - Session 1 — Individual Entity Elicitation: - | ID | Question | Answer | Status | - |----|----------|--------|--------| - | Q1 | ... | ... | OPEN / ANSWERED | - Template §1: CONFIRMED - Synthesis: <PO synthesis — confirmed by stakeholder> - - Session 2 — Behavior Groups / Big Picture: - | ID | Question | Answer | Status | - |----|----------|--------|--------| - | Q2 | ... | ... | OPEN / ANSWERED | - Template §2: CONFIRMED - Behavior Groups: - - <Behavior group name>: <one-sentence summary> - - Session 3 — Feature Synthesis: - Synthesis: <full synthesis across all behavior groups> - Template §3: CONFIRMED — stakeholder approved YYYY-MM-DD - Rule: <User story title> As a <role> I want <goal> @@ -389,41 +361,115 @@ The **Rules (Business)** section captures business rules that hold across multip The **Constraints** section captures non-functional requirements. 
Testable constraints should become `Example:` blocks with `@id` tags. +What is **not** in `.feature` files: +- Entities table — domain model lives in `docs/discovery.md` +- Session Q&A blocks — live in `docs/discovery_journal.md` +- Template §N markers — live in `docs/discovery_journal.md` session blocks +- Architecture section — lives in `docs/architecture.md` + +--- + +## Project-Level Discovery Templates + +Three files hold project-level discovery content. Use these templates when creating them for the first time. + +### `docs/discovery_journal.md` — Raw Q&A (append-only) + +```markdown +# Discovery Journal: <project-name> + --- -## Project-Level Discovery (`docs/features/discovery.md`) +## YYYY-MM-DD — Session 1 +Status: IN-PROGRESS + +### General + +| ID | Question | Answer | +|----|----------|--------| +| Q1 | Who are the users? | ... | +| Q2 | What does the product do at a high level? | ... | +| Q3 | Why does it exist — what problem does it solve? | ... | +| Q4 | When and where is it used? | ... | +| Q5 | Success — what does "done" look like? | ... | +| Q6 | Failure — what must never happen? | ... | +| Q7 | Out-of-scope — what are we explicitly not building? | ... | + +### <Group Name> + +| ID | Question | Answer | +|----|----------|--------| +| Q8 | ... | ... | + +### Feature: <feature-stem> + +| ID | Question | Answer | +|----|----------|--------| +| Q9 | ... | ... 
| + +(At session end, the header `Status` above is flipped to COMPLETE — see Step D.) +``` + +Rules: +- Session header written first with `Status: IN-PROGRESS` before any Q&A +- Only answered questions are written; unanswered questions are discarded +- Questions grouped by topic (general, cross-cutting groups, per-feature) +- Session header `Status` flipped from `IN-PROGRESS` to `COMPLETE` after all writes are done (Step D) +- Never edit past session blocks — the current header's `Status` flip is the only permitted in-place change; otherwise only append new session blocks + +### `docs/discovery.md` — Synthesis Changelog (append-only) ```markdown # Discovery: <project-name> -## State -Status: ELICITING | BASELINED (YYYY-MM-DD) +--- + +## Session: YYYY-MM-DD -## Session 1 — Individual Scope Elicitation +### Context +<3–5 sentence synthesis of who the users are, what the product does, why it exists, +success/failure conditions, and out-of-scope boundaries.> +(First session only. Omit this subsection in subsequent sessions.) -| ID | Question | Answer | Status | -|----|----------|--------|--------| -| Q1 | Who are the users? | ... | OPEN / ANSWERED | +### Feature List +- `<feature-stem>` — <one-sentence description of what changed or was added> +(Write "No changes" if no features were added or modified this session.) -Template §1: CONFIRMED -Synthesis: <PO synthesis — confirmed by stakeholder> -Pre-mortem: <gaps identified; new questions added above> +### Domain Model +| Type | Name | Description | In Scope | +|------|------|-------------|----------| +| Noun | <name> | <description> | Yes | +| Verb | <name> | <description> | Yes | +(Write "No changes" if domain model was not updated this session.) 
+``` -## Session 2 — Behavior Groups / Big Picture +Rules: +- Each session appends one `## Session: YYYY-MM-DD` block +- Synthesis block is written LAST — only after all `.feature` file descriptions are updated +- No project-level `Status: BASELINED` — feature-level BASELINED in `.feature` files is the gate +- Never edit past blocks — append only; later blocks extend or supersede earlier ones -| ID | Question | Answer | Status | -|----|----------|--------|--------| -| Q2 | ... | ... | OPEN / ANSWERED | +### `docs/architecture.md` — Architectural Decisions (append-only, software-engineer) -Template §2: CONFIRMED -Behavior Groups: -- <Behavior group name>: <one-sentence summary> +```markdown +# Architecture: <project-name> -## Session 3 — Full Synthesis +--- -<3–6 paragraph synthesis of all scope, behavior groups, and boundaries> +## YYYY-MM-DD — <feature-stem>: <short title> -Template §3: CONFIRMED — stakeholder approved YYYY-MM-DD +Decision: <what was decided — one sentence> +Reason: <why — one sentence> +Alternatives considered: <what was rejected and why> +Feature: <feature-stem> ``` -No Entities table at project level. +Rules: Append-only. When a decision changes, append a new block that supersedes the old one. Cross-feature decisions use `Cross-feature:` in the header. Only write a block for non-obvious decisions with meaningful trade-offs. + +Base directory for this skill: file:///home/user/Documents/projects/python-project-template/.opencode/skills/scope +Relative paths in this skill (e.g., scripts/, reference/) are relative to this base directory. +Note: file list is sampled. 
+ +<skill_files> +<file>/home/user/Documents/projects/python-project-template/.opencode/skills/scope/discovery-template.md</file> +</skill_files> diff --git a/.opencode/skills/scope/discovery-template.md b/.opencode/skills/scope/discovery-template.md index 117d025..aa4cc5c 100644 --- a/.opencode/skills/scope/discovery-template.md +++ b/.opencode/skills/scope/discovery-template.md @@ -1,33 +1,9 @@ Feature: <feature-name> - Discovery: + <2–4 sentence description of what this feature does and why it exists.> Status: ELICITING - Entities: - | Type | Name | Candidate Class/Method | In Scope | - |------|------|----------------------|----------| - Rules (Business): Constraints: - - Session 1 — Individual Entity Elicitation: - | ID | Question | Answer | Status | - |----|----------|--------|--------| - - Template §1: PENDING - Synthesis: (fill after stakeholder confirms) - Pre-mortem: (fill after synthesis is confirmed) - - Session 2 — Behavior Groups / Big Picture: - | ID | Question | Answer | Status | - |----|----------|--------|--------| - - Template §2: PENDING - Behavior Groups: - - (fill after all group questions are answered) - - Session 3 — Feature Synthesis: - (fill after Sessions 1 and 2 are complete) - Template §3: PENDING diff --git a/.opencode/skills/session-workflow/SKILL.md b/.opencode/skills/session-workflow/SKILL.md index 658023f..0281f2c 100644 --- a/.opencode/skills/session-workflow/SKILL.md +++ b/.opencode/skills/session-workflow/SKILL.md @@ -19,15 +19,20 @@ Every session starts by reading state. Every session ends by writing state. This # Current Work No feature in progress. - Next: PO picks feature from docs/features/backlog/ and moves it to docs/features/in-progress/. + Next: Run @product-owner — load skill feature-selection and pick the next BASELINED feature from backlog. ``` -2. 
If a feature is active, read: - - `docs/features/in-progress/<name>.feature` — feature file (discovery + architecture + Rules + Examples) - - `docs/features/discovery.md` — project-level discovery (for context) -3. Run `git status` — understand what is committed vs. what is not -4. Confirm scope: you are working on exactly one step of one feature +2. **If you are the PO** and Step 1 (SCOPE) is active: check `docs/discovery_journal.md` for the most recent session block. + - If the most recent block has `Status: IN-PROGRESS` → the previous session was interrupted. Resume it before starting a new session: finish updating `.feature` files and `docs/discovery.md`, then mark the block `Status: COMPLETE`. +3. If a feature is active at Step 2–5, read: + - `docs/features/in-progress/<feature-stem>.feature` — feature file (Rules + Examples + @id) + - `docs/discovery.md` — project-level synthesis changelog (for context) +4. Run `git status` — understand what is committed vs. what is not +5. Confirm scope: you are working on exactly one step of one feature -If TODO.md says "No feature in progress", load `skill feature-selection` — it guides the PO through scoring and selecting the next BASELINED backlog feature. **The software-engineer never self-selects a feature from the backlog — only the PO picks.** The PO must verify the feature has `Status: BASELINED` in its discovery section before moving it to `in-progress/` — if not baselined, the PO must complete Step 1 first. +**If TODO.md says "No feature in progress":** + +- **PO**: Load `skill feature-selection` — it guides you through scoring and selecting the next BASELINED backlog feature. You must verify the feature has `Status: BASELINED` before moving it to `in-progress/`. Only you may move it. +- **Software-engineer or reviewer**: Update TODO.md `Next:` line to `Run @product-owner — load skill feature-selection and pick the next BASELINED feature from backlog.` Then **stop**. Never self-select a feature. 
Never move a `.feature` file. ## Session End @@ -35,13 +40,12 @@ If TODO.md says "No feature in progress", load `skill feature-selection` — it - Mark completed criteria `[x]` - Mark in-progress criteria `[~]` - Update the "Next" line with one concrete action -2. Run `uv run task gen-todo` to sync any new @id rows from .feature files into TODO.md. -3. Commit any uncommitted work (even WIP): +2. Commit any uncommitted work (even WIP): ```bash git add -A - git commit -m "WIP(<feature-name>): <what was done>" + git commit -m "WIP(<feature-stem>): <what was done>" ``` -4. If a step is fully complete, use the proper commit message instead of WIP. +3. If a step is fully complete, use the proper commit message instead of WIP. ## Step Completion Protocol @@ -51,7 +55,7 @@ When a step completes within a session: 2. Commit the TODO.md update: ```bash git add TODO.md - git commit -m "chore: complete step <N> for <feature-name>" + git commit -m "chore: complete step <N> for <feature-stem>" ``` 3. Only then begin the next step (in a new session where possible — see Rule 4). @@ -60,9 +64,9 @@ When a step completes within a session: ```markdown # Current Work -Feature: <name> +Feature: <feature-stem> Step: <1-5> (<step name>) -Source: docs/features/in-progress/<name>.feature +Source: docs/features/in-progress/<feature-stem>.feature ## Progress - [x] `@id:<hex>`: <description> @@ -75,15 +79,15 @@ Run @<agent-name> — <one concrete action> **"Next" line format**: Always prefix with `Run @<agent-name>` so the human knows exactly which agent to invoke. Agent names are defined in `AGENTS.md` — use the name exactly as listed there. 
Examples: - `Run @<software-engineer-agent> — implement @id:a1b2c3d4 (Step 3 RED)` -- `Run @<software-engineer-agent> — load skill implementation and begin Step 2 (Architecture) for <feature-name>` -- `Run @<reviewer-agent> — verify feature <feature-name> at Step 4` +- `Run @<software-engineer-agent> — load skill implementation and begin Step 2 (Architecture) for <feature-stem>` +- `Run @<reviewer-agent> — verify feature <feature-stem> at Step 4` - `Run @<product-owner-agent> — pick next BASELINED feature from backlog` -- `Run @<product-owner-agent> — accept feature <feature-name> at Step 5` +- `Run @<product-owner-agent> — accept feature <feature-stem> at Step 5` **Source path by step:** -- Step 1: `Source: docs/features/backlog/<name>.feature` -- Steps 2–4: `Source: docs/features/in-progress/<name>.feature` -- Step 5: `Source: docs/features/completed/<name>.feature` +- Step 1: `Source: docs/features/backlog/<feature-stem>.feature` +- Steps 2–4: `Source: docs/features/in-progress/<feature-stem>.feature` +- Step 5: `Source: docs/features/completed/<feature-stem>.feature` Status markers: - `[ ]` — not started @@ -103,22 +107,17 @@ Next: Run @<product-owner-agent> — load skill feature-selection and pick the n During Step 3 (TDD Loop), TODO.md **must** include a `## Cycle State` block to track Red-Green-Refactor progress. -When `Phase: REFACTOR` is complete, a `## Self-Declaration` block is also **mandatory** before handing off to Step 4. - ```markdown # Current Work -Feature: <name> +Feature: <feature-stem> Step: 3 (TDD Loop) -Source: docs/features/in-progress/<name>.feature +Source: docs/features/in-progress/<feature-stem>.feature ## Cycle State Test: `@id:<hex>` — <description> Phase: RED | GREEN | REFACTOR -## Self-Declaration -As a software-engineer I declare this code follows YAGNI-1 ... 
(full checklist in implementation/SKILL.md) - ## Progress - [x] `@id:<hex>`: <description> - [~] `@id:<hex>`: <description> ← in progress (see Cycle State) @@ -134,23 +133,6 @@ As a software-engineer I declare this code follows YAGNI-1 ... (full checklist i - Move from `GREEN` → `REFACTOR` when the test passes - Move from `REFACTOR` → mark `@id` complete in `## Progress` when test-fast passes -## gen-todo Script - -`uv run task gen-todo` keeps TODO.md in sync with `.feature` files: - -```bash -uv run task gen-todo # merge-write: add missing @id rows, preserve existing status -uv run task gen-todo -- --check # dry run — report what would change -``` - -**Merge rules:** -- Adds any `@id` rows from in-progress `.feature` files that are missing in `## Progress` -- Never removes or downgrades existing `[x]`, `[~]`, `[-]` rows -- Preserves the `Step:` field and `## Next` line from the current TODO.md -- If no feature is in-progress, writes the "No feature in progress" format - -Run `gen-todo` at session start (after reading TODO.md) and at session end (before committing). - ## Rules 1. Never skip reading TODO.md at session start @@ -160,4 +142,3 @@ Run `gen-todo` at session start (after reading TODO.md) and at session end (befo 5. The "Next" line must be actionable enough that a fresh AI can execute it without asking questions 6. During Step 3, always update `## Cycle State` when transitioning between RED/GREEN/REFACTOR phases 7. When a step completes, update TODO.md and commit **before** any further work -8. 
During Step 3, write the `## Self-Declaration` block into TODO.md after all quality gates pass — every claim must have AGREE/DISAGREE with `file:line` evidence diff --git a/.opencode/skills/session-workflow/scripts/gen_todo.py b/.opencode/skills/session-workflow/scripts/gen_todo.py deleted file mode 100644 index 980df1e..0000000 --- a/.opencode/skills/session-workflow/scripts/gen_todo.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Generate and sync the TODO.md session bookmark from .feature files. - -Reads the in-progress .feature file (or reports none if not present), -merges missing @id rows into the existing TODO.md, and writes the result. - -Modes: - uv run task gen-todo Merge-write TODO.md (default) - uv run task gen-todo -- --check Dry run — show what would change - -Merge rules: - - Adds @id rows that are in .feature files but missing from TODO.md - - Never removes or downgrades existing [x], [~], [-] rows - - Updates the Feature/Step/Source header from the in-progress file - - If no feature is in-progress, writes the "No feature in progress" format -""" - -from __future__ import annotations - -import re -import sys -from dataclasses import dataclass -from pathlib import Path - -PROJECT_ROOT = Path(__file__).resolve().parents[4] -FEATURES_DIR = PROJECT_ROOT / "docs" / "features" -TODO_PATH = PROJECT_ROOT / "TODO.md" - -PROGRESS_ROW_RE = re.compile(r"^- \[(?P<status>[x~\- ])\] `@id:(?P<id>[a-f0-9]{8})`") -ID_TAG_RE = re.compile(r"@id:([a-f0-9]{8})") -EXAMPLE_RE = re.compile(r"^\s*Example:\s*(.+)$") -DEPRECATED_TAG_RE = re.compile(r"@deprecated") - - -@dataclass(frozen=True, slots=True) -class Criterion: - """One acceptance criterion extracted from a .feature file.""" - - id_hex: str - title: str - deprecated: bool - - -def find_in_progress_feature() -> tuple[str, Path] | None: - """Find the single .feature file currently in docs/features/in-progress/. - - Returns: - Tuple of (feature_name, feature_file_path) or None if nothing is in progress. 
- feature_name is the .feature file stem (e.g. 'display-version'). - """ - in_progress = FEATURES_DIR / "in-progress" - if not in_progress.exists(): - return None - feature_files = [ - f for f in in_progress.iterdir() if f.is_file() and f.suffix == ".feature" - ] - if not feature_files: - return None - feature_file = feature_files[0] - return feature_file.stem, feature_file - - -def find_backlog_features() -> list[str]: - """List feature names in docs/features/backlog/. - - Returns: - Sorted list of .feature file stems. - """ - backlog = FEATURES_DIR / "backlog" - if not backlog.exists(): - return [] - return sorted( - f.stem for f in backlog.iterdir() if f.is_file() and f.suffix == ".feature" - ) - - -def extract_criteria(feature_path: Path) -> list[Criterion]: - """Extract all @id-tagged Examples from a single .feature file. - - Args: - feature_path: Path to the .feature file. - - Returns: - Ordered list of Criterion objects (deprecated ones included). - """ - return _parse_feature_file(feature_path) - - -def _parse_feature_file(path: Path) -> list[Criterion]: - """Parse a single .feature file for @id-tagged Examples. - - Args: - path: Path to the .feature file. - - Returns: - List of Criterion objects found in this file. - """ - lines = path.read_text(encoding="utf-8").splitlines() - criteria: list[Criterion] = [] - i = 0 - while i < len(lines): - line = lines[i] - id_match = ID_TAG_RE.search(line) - if id_match: - id_hex = id_match.group(1) - deprecated = bool(DEPRECATED_TAG_RE.search(line)) - title = _find_example_title(lines, i + 1) - criteria.append( - Criterion(id_hex=id_hex, title=title, deprecated=deprecated) - ) - i += 1 - return criteria - - -def _find_example_title(lines: list[str], start: int) -> str: - """Scan forward from start to find the Example: title line. - - Args: - lines: All lines from the .feature file. - start: Index to start scanning from. - - Returns: - The Example title string, or empty string if not found. 
- """ - for i in range(start, min(start + 5, len(lines))): - m = EXAMPLE_RE.match(lines[i]) - if m: - return m.group(1).strip() - return "" - - -def read_existing_progress(todo_text: str) -> dict[str, str]: - """Extract existing @id rows and their status from TODO.md content. - - Args: - todo_text: Full content of current TODO.md. - - Returns: - Dict mapping id_hex -> status character ('x', '~', '-', ' '). - """ - existing: dict[str, str] = {} - for line in todo_text.splitlines(): - m = PROGRESS_ROW_RE.match(line) - if m: - existing[m.group("id")] = m.group("status") - return existing - - -def build_progress_lines( - criteria: list[Criterion], - existing: dict[str, str], -) -> list[str]: - """Build the ## Progress section lines, merging new with existing. - - Args: - criteria: All criteria from .feature files (in order). - existing: Existing @id -> status mapping from current TODO.md. - - Returns: - List of progress row strings (without trailing newline). - """ - lines = [] - for c in criteria: - status = existing.get(c.id_hex, " ") - label = c.title or "(no title)" - suffix = " — DEPRECATED" if c.deprecated else "" - lines.append(f"- [{status}] `@id:{c.id_hex}`: {label}{suffix}") - return lines - - -def build_todo_content( - feature_name: str, - step: str, - source: str, - progress_lines: list[str], - next_action: str, -) -> str: - """Assemble the full TODO.md content. - - Args: - feature_name: Display name of the current feature. - step: Current step number and name, e.g. '4 (implement)'. - source: Path to discovery.md. - progress_lines: The ## Progress rows. - next_action: The ## Next one-liner. - - Returns: - Full TODO.md content string. - """ - lines = [ - "# Current Work", - "", - f"Feature: {feature_name}", - f"Step: {step}", - f"Source: {source}", - "", - "## Progress", - *progress_lines, - "", - "## Next", - next_action, - "", - ] - return "\n".join(lines) - - -def build_empty_todo() -> str: - """Build the 'No feature in progress' TODO.md content. 
- - Returns: - Minimal TODO.md content string. - """ - return "\n".join( - [ - "# Current Work", - "", - "No feature in progress.", - "Next: PO picks feature from docs/features/backlog/ and moves it to" - " docs/features/in-progress/.", - "", - ] - ) - - -def _extract_header_field(todo_text: str, field: str) -> str: - """Extract a header field value from existing TODO.md. - - Args: - todo_text: Full TODO.md content. - field: Field name to look for (e.g. 'Step', 'Feature'). - - Returns: - The value string, or empty string if not found. - """ - pattern = re.compile(rf"^{field}:\s*(.+)$", re.MULTILINE) - m = pattern.search(todo_text) - return m.group(1).strip() if m else "" - - -def _extract_next_action(todo_text: str) -> str: - """Extract the ## Next line from existing TODO.md. - - Args: - todo_text: Full TODO.md content. - - Returns: - The Next action string, or a placeholder. - """ - lines = todo_text.splitlines() - for i, line in enumerate(lines): - if line.strip() == "## Next" and i + 1 < len(lines) and lines[i + 1].strip(): - return lines[i + 1].strip() - return "<fill in next action>" - - -def _sync_no_feature(*, check_only: bool) -> int: - """Handle sync when no feature is in progress. - - Args: - check_only: If True, report changes without writing. - - Returns: - Exit code: 0 = in sync or wrote successfully, 1 = changes needed (check mode). - """ - new_content = build_empty_todo() - existing = TODO_PATH.read_text(encoding="utf-8") if TODO_PATH.exists() else "" - if existing.strip() == new_content.strip(): - print("TODO.md is in sync.") - return 0 - if check_only: - print("TODO.md would be updated: no feature in progress format.") - return 1 - TODO_PATH.write_text(new_content, encoding="utf-8") - print("TODO.md updated: no feature in progress.") - return 0 - - -def _write_or_report( - new_content: str, - new_ids: set[str], - criteria: list[Criterion], - *, - check_only: bool, -) -> int: - """Write updated TODO.md or report what would change. 
- - Args: - new_content: The new TODO.md content to write. - new_ids: Set of @id hex values that are new (not in existing TODO.md). - criteria: All criteria from .feature files. - check_only: If True, report changes without writing. - - Returns: - Exit code: 0 = wrote successfully, 1 = changes needed (check mode). - """ - if check_only: - if new_ids: - print(f"TODO.md would add {len(new_ids)} new @id row(s):") - for c in criteria: - if c.id_hex in new_ids: - print(f" [ ] @id:{c.id_hex}: {c.title}") - else: - print("TODO.md header or structure would be updated.") - return 1 - TODO_PATH.write_text(new_content, encoding="utf-8") - if new_ids: - print(f"TODO.md updated: added {len(new_ids)} new @id row(s).") - for c in criteria: - if c.id_hex in new_ids: - print(f" [ ] @id:{c.id_hex}: {c.title}") - else: - print("TODO.md updated.") - return 0 - - -def sync_todo(*, check_only: bool = False) -> int: - """Main sync logic: read feature state, merge TODO.md, write if changed. - - Args: - check_only: If True, report changes without writing. - - Returns: - Exit code: 0 = in sync or wrote successfully, 1 = changes needed (check mode). - """ - result = find_in_progress_feature() - - if result is None: - return _sync_no_feature(check_only=check_only) - - feature_name, feature_path = result - criteria = extract_criteria(feature_path) - - existing_text = TODO_PATH.read_text(encoding="utf-8") if TODO_PATH.exists() else "" - existing_progress = read_existing_progress(existing_text) - - step = ( - _extract_header_field(existing_text, "Step") or "? 
(unknown — update manually)" - ) - source = f"docs/features/in-progress/{feature_name}.feature" - next_action = _extract_next_action(existing_text) - - progress_lines = build_progress_lines(criteria, existing_progress) - new_content = build_todo_content( - feature_name=feature_name, - step=step, - source=source, - progress_lines=progress_lines, - next_action=next_action, - ) - - existing_ids = set(existing_progress.keys()) - new_ids = {c.id_hex for c in criteria} - existing_ids - - if existing_text.strip() == new_content.strip(): - print("TODO.md is in sync.") - return 0 - - return _write_or_report(new_content, new_ids, criteria, check_only=check_only) - - -def main() -> int: - """Entry point for the gen-todo command. - - Returns: - Exit code (0 = success, 1 = changes needed in check mode). - """ - check_only = "--check" in sys.argv - return sync_todo(check_only=check_only) - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/.opencode/skills/verify/SKILL.md b/.opencode/skills/verify/SKILL.md index 7d145f2..3d5c449 100644 --- a/.opencode/skills/verify/SKILL.md +++ b/.opencode/skills/verify/SKILL.md @@ -1,7 +1,7 @@ --- name: verify description: Step 4 — run all verification commands, review code quality, and produce a written report -version: "3.0" +version: "4.0" author: reviewer audience: reviewer workflow: feature-lifecycle @@ -15,9 +15,9 @@ This skill guides the reviewer through Step 4: independent verification that the **Every PASS/FAIL cell must have evidence.** Empty evidence = UNCHECKED = REJECTED. -## When to Use (Step 4) +**You never move `.feature` files.** After producing an APPROVED report: update TODO.md `Next:` to `Run @product-owner — accept feature <name> at Step 5.` then stop. The PO accepts the feature and moves the file. -After the software-engineer signals Step 3 is complete and all self-verification checks pass. Do not start verification until the software-engineer has committed all work and written the Self-Declaration. 
+The reviewer produces one written report (see template below) that includes: all gate results, the SE Self-Declaration Audit, the **Reviewer Stance Declaration**, and the final APPROVED/REJECTED verdict. Do not start until the software-engineer has committed all work and communicated the Self-Declaration verbally in the handoff message. ## Step-by-Step @@ -26,8 +26,8 @@ After the software-engineer signals Step 3 is complete and all self-verification Read `docs/features/in-progress/<name>.feature`. Extract: - All `@id` tags and their Example titles from `Rule:` blocks - The interaction model (if the feature involves user interaction) -- The Architecture section (module structure, ADRs) -- The software-engineer's Self-Declaration from `TODO.md` +- The architectural decisions in `docs/architecture.md` relevant to this feature +- The software-engineer's Self-Declaration (communicated verbally in the handoff message) ### 2. pyproject.toml Gate @@ -58,13 +58,30 @@ Run before code review. If any row is FAIL, stop immediately with REJECTED. | App exits cleanly | `timeout 10s uv run task run` | Exit 0 or non-124 | Exit 124 (timeout/hang) | Fix the hang | | Output changes when input changes | Run app, change an input or condition, observe output | Output changes accordingly | Output is static | Implement real logic | -### 5. Code Review +### 5. Self-Declaration Audit + +**Completeness check (hard gate — REJECT if failed)**: Count the numbered items in the SE's Self-Declaration. The template in `implementation/SKILL.md` has exactly 25 items numbered 1–25. If the count is not 25, or any number in the sequence 1–25 is missing, REJECT immediately — do not proceed to item-level audit. + +Read the software-engineer's Self-Declaration from the handoff message. + +For every **AGREE** claim: +- Find the `file:line` — does it hold? + +For every **DISAGREE** claim: +- Read the justification carefully. +- If the constraint genuinely falls outside the SE's control (e.g. 
external library forces method chaining, dataclass/Pydantic/TypedDict exemption for ≤2 ivars): accept with a note in the report and suggest the closest compliant alternative if one exists. +- If the justification is weak, incomplete, or a best-practice alternative exists that the SE did not consider: REJECT with the specific alternative stated. +- If there is no justification: REJECT. + +Undeclared violations found during code review → REJECT. + +### 6. Code Review Read the source files changed in this feature. **Do this before running lint/static-check/test** — if code review finds a design problem, commands will need to re-run after the fix anyway. **Stop on first failure category — do not accumulate issues.** -#### 5a. Correctness — any FAIL → REJECTED +#### 6a. Correctness — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| @@ -72,7 +89,7 @@ Read the source files changed in this feature. **Do this before running lint/sta | No duplicate logic (DRY) | Search for repeated blocks doing the same thing | None found | Duplication found | Extract to shared function | | No over-engineering (YAGNI) | Check for abstractions with no current use | None found | Unused abstraction | Remove unused code | -#### 5b. Simplicity (KISS) — any FAIL → REJECTED +#### 6b. Simplicity (KISS) — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| @@ -81,7 +98,7 @@ Read the source files changed in this feature. **Do this before running lint/sta | Functions ≤ 20 lines | Count lines | ≤ 20 | > 20 | Extract helper | | Classes ≤ 50 lines | Count lines | ≤ 50 | > 50 | Split class | -#### 5c. SOLID — any FAIL → REJECTED +#### 6c. SOLID — any FAIL → REJECTED | Principle | Why it matters | What to check | How to check | |---|---|---|---| @@ -91,21 +108,11 @@ Read the source files changed in this feature. 
**Do this before running lint/sta | ISP | Fat interfaces force unused methods | No Protocol forces stub implementations | Check for NotImplementedError | | DIP | Concrete I/O makes unit testing impossible | High-level depends on abstractions | Check domain imports no I/O/DB | -#### 5d. Object Calisthenics — any FAIL → REJECTED +#### 6d. Object Calisthenics — any FAIL → REJECTED -| # | Rule | How to check | -|---|---|---| -| 1 | One indent level per method | Count max nesting | -| 2 | No `else` after `return` | Search for `else` after early returns | -| 3 | Primitives wrapped | Bare `int`/`str` in domain signatures = FAIL | -| 4 | Collections wrapped | `list[X]` as domain value = FAIL | -| 5 | One dot per line | `a.b.c()` chains = FAIL | -| 6 | No abbreviations | `mgr`, `tmp`, `calc` = FAIL | -| 7 | Small entities | Functions > 20 lines or classes > 50 lines = FAIL | -| 8 | ≤ 2 instance variables | Count `self.x` in `__init__` — >2 = FAIL | -| 9 | No getters/setters | `get_x()`/`set_x()` = FAIL | - -#### 5e. Design Patterns — any FAIL → REJECTED +Load `skill design-patterns` and apply the full OC checklist (9 rules). Record a PASS/FAIL with `file:line` evidence for each rule. Rules 1 and 7 (nesting and entity size) share thresholds with 6b above. + +#### 6e. Design Patterns — any FAIL → REJECTED | Code smell | Pattern missed | How to check | |---|---|---| @@ -115,7 +122,7 @@ Read the source files changed in this feature. **Do this before running lint/sta | External dep without Protocol | Repository/Adapter | Check dep injection | | 0 domain classes, many functions | Missing domain model | Count classes vs functions | -#### 5f. Tests — any FAIL → REJECTED +#### 6f. Tests — any FAIL → REJECTED | Check | How to check | PASS | FAIL | |---|---|---|---| @@ -124,10 +131,10 @@ Read the source files changed in this feature. 
**Do this before running lint/sta | No internal attribute access | Search for `_x` in assertions | None found | `_x`, `isinstance`, `type()` | | Every `@id` has a mapped test | Match `@id` to test functions | All mapped | Missing test | | No orphaned skipped stubs | Search for `@pytest.mark.skip` in `tests/features/` | None found | Any found — stub was written but never implemented | -| Function naming | Matches `test_<rule_slug>_<8char_hex>` | All match | Mismatch | +| Function naming | Matches `test_<feature_slug>_<8char_hex>` | All match | Mismatch | | Hypothesis tests have `@slow` | Read every `@given` for `@slow` marker | All present | Any missing | -#### 5g. Code Quality — any FAIL → REJECTED +#### 6g. Code Quality — any FAIL → REJECTED | Check | How to check | PASS | FAIL | |---|---|---|---| @@ -136,7 +143,7 @@ Read the source files changed in this feature. **Do this before running lint/sta | Public functions have type hints | Read signatures | All annotated | Missing | | Public functions have docstrings | Read source | Google-style | Missing | -### 6. Run Verification Commands +### 7. Run Verification Commands ```bash uv run task lint @@ -148,28 +155,16 @@ Expected for each: exit 0, no errors. Record exact output on failure. If a command fails, stop and REJECT immediately. Do not run subsequent commands. -### 7. Interactive Verification +### 8. Interactive Verification If the feature involves user interaction: run the app, provide real input, verify output changes. Record what input was given and what output was observed. -### 8. Self-Declaration Audit - -Read the software-engineer's Self-Declaration from `TODO.md`. - -For every **AGREE** claim: -- Find the `file:line` — does it hold? - -For every **DISAGREE** claim: -- REJECT — the software-engineer must fix before requesting review again. - -Undeclared violations → REJECT. - ### 9. 
Write the Report ```markdown -## Step 4 Verification Report — <feature-name> +## Step 4 Verification Report — <feature-stem> ### pyproject.toml Gate | Check | Result | Notes | @@ -190,29 +185,47 @@ Undeclared violations → REJECT. | uv run task test | PASS / FAIL | | ### Self-Declaration Audit -| Claim | Software-Engineer Claims | Reviewer Verdict | Evidence | -|------|-------------------------|------------------|----------| -| YAGNI | AGREE/DISAGREE | PASS/FAIL | | -| KISS | AGREE/DISAGREE | PASS/FAIL | | -| DRY | AGREE/DISAGREE | PASS/FAIL | | -| SOLID-S | AGREE/DISAGREE | PASS/FAIL | | -| SOLID-O | AGREE/DISAGREE | PASS/FAIL | | -| SOLID-L | AGREE/DISAGREE | PASS/FAIL | | -| SOLID-I | AGREE/DISAGREE | PASS/FAIL | | -| SOLID-D | AGREE/DISAGREE | PASS/FAIL | | -| OC-1 | AGREE/DISAGREE | PASS/FAIL | | -| OC-2 | AGREE/DISAGREE | PASS/FAIL | | -| OC-3 | AGREE/DISAGREE | PASS/FAIL | | -| OC-4 | AGREE/DISAGREE | PASS/FAIL | | -| OC-5 | AGREE/DISAGREE | PASS/FAIL | | -| OC-6 | AGREE/DISAGREE | PASS/FAIL | | -| OC-7 | AGREE/DISAGREE | PASS/FAIL | | -| OC-8 | AGREE/DISAGREE | PASS/FAIL | | -| OC-9 | AGREE/DISAGREE | PASS/FAIL | | -| Patterns Creational | AGREE/DISAGREE | PASS/FAIL | | -| Patterns Structural | AGREE/DISAGREE | PASS/FAIL | | -| Patterns Behavioral | AGREE/DISAGREE | PASS/FAIL | | -| Semantic | AGREE/DISAGREE | PASS/FAIL | | +| # | Claim | SE Claims | Reviewer Verdict | Evidence | +|---|-------|-----------|------------------|----------| +| 1 | YAGNI: no code without a failing test | AGREE/DISAGREE | PASS/FAIL | | +| 2 | YAGNI: no speculative abstractions | AGREE/DISAGREE | PASS/FAIL | | +| 3 | KISS: simplest solution that passes | AGREE/DISAGREE | PASS/FAIL | | +| 4 | KISS: no premature optimization | AGREE/DISAGREE | PASS/FAIL | | +| 5 | DRY: no duplication | AGREE/DISAGREE | PASS/FAIL | | +| 6 | DRY: no redundant comments | AGREE/DISAGREE | PASS/FAIL | | +| 7 | SOLID-S: one reason to change per class | AGREE/DISAGREE | PASS/FAIL | | +| 8 | SOLID-O: open for 
extension, closed for modification | AGREE/DISAGREE | PASS/FAIL | | +| 9 | SOLID-L: subtypes substitutable | AGREE/DISAGREE | PASS/FAIL | | +| 10 | SOLID-I: no forced unused deps | AGREE/DISAGREE | PASS/FAIL | | +| 11 | SOLID-D: depend on abstractions, not concretions | AGREE/DISAGREE | PASS/FAIL | | +| 12 | OC-1: one level of indentation per method | AGREE/DISAGREE | PASS/FAIL | | +| 13 | OC-2: no else after return | AGREE/DISAGREE | PASS/FAIL | | +| 14 | OC-3: primitive types wrapped | AGREE/DISAGREE | PASS/FAIL | | +| 15 | OC-4: first-class collections | AGREE/DISAGREE | PASS/FAIL | | +| 16 | OC-5: one dot per line | AGREE/DISAGREE | PASS/FAIL | | +| 17 | OC-6: no abbreviations | AGREE/DISAGREE | PASS/FAIL | | +| 18 | OC-7: ≤20 lines per function, ≤50 per class | AGREE/DISAGREE | PASS/FAIL | | +| 19 | OC-8: ≤2 instance variables (behavioral classes only) | AGREE/DISAGREE | PASS/FAIL | | +| 20 | OC-9: no getters/setters | AGREE/DISAGREE | PASS/FAIL | | +| 21 | Patterns: no good reason remains to refactor using OOP or Design Patterns | AGREE/DISAGREE | PASS/FAIL | | +| 22 | Patterns: no creational smell | AGREE/DISAGREE | PASS/FAIL | | +| 23 | Patterns: no structural smell | AGREE/DISAGREE | PASS/FAIL | | +| 24 | Patterns: no behavioral smell | AGREE/DISAGREE | PASS/FAIL | | +| 25 | Semantic: tests operate at same abstraction as AC | AGREE/DISAGREE | PASS/FAIL | | + +### Reviewer Stance Declaration + +Write this block **before** the Decision. Every `DISAGREE` must include an inline explanation. A `DISAGREE` with no explanation auto-forces `REJECTED`. 
+ +```markdown +## Reviewer Stance Declaration +As a reviewer I declare: +* Adversarial: I actively tried to find a failure mode, not just confirm passing — AGREE/DISAGREE | note: +* Manual trace: I traced at least one execution path manually beyond automated output — AGREE/DISAGREE | path: +* Boundary check: I checked the boundary conditions and edge cases of every Rule — AGREE/DISAGREE | gaps: +* Semantic read: I read each test against its AC and confirmed it tests the right observable behavior — AGREE/DISAGREE | mismatches: +* Independence: my verdict was not influenced by how much effort has already been spent — AGREE/DISAGREE +``` ### Decision **APPROVED** — all gates passed, no undeclared violations @@ -225,25 +238,4 @@ OR **If REJECTED**: Run `@software-engineer` — apply the fixes listed above, re-run quality gate, update Self-Declaration, then signal Step 4 again. ``` -## Standards Summary - -| Check | Standard | -|---|---| -| Test coverage | 100% | -| Type errors | 0 | -| Lint errors | 0 | -| Function length | ≤ 20 lines | -| Class length | ≤ 50 lines | -| Max nesting | 2 levels | -| Instance variables | ≤ 2 per class | -| Uncovered `@id` tags | 0 | -| `noqa` comments | 0 | -| `type: ignore` | 0 | -| Semantic alignment mismatches | 0 | -| SOLID FAIL rows | 0 | -| ObjCal FAIL rows | 0 | -| Design pattern FAIL rows | 0 | -| Duplicate `@id` in tests | 0 | -| Empty evidence cells | 0 | -| Orphaned tests | 0 | -| Hypothesis tests missing `@pytest.mark.slow` | 0 | + diff --git a/AGENTS.md b/AGENTS.md index 1223815..e483107 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,16 +5,16 @@ A Python template to quickstart any project with a production-ready workflow, qu ## Workflow Overview Features flow through 5 steps with a WIP limit of 1 feature at a time. 
The filesystem enforces WIP: -- `docs/features/backlog/<feature-name>.feature` — features waiting to be worked on -- `docs/features/in-progress/<feature-name>.feature` — exactly one feature being built right now -- `docs/features/completed/<feature-name>.feature` — accepted and shipped features +- `docs/features/backlog/<feature-stem>.feature` — features waiting to be worked on +- `docs/features/in-progress/<feature-stem>.feature` — exactly one feature being built right now +- `docs/features/completed/<feature-stem>.feature` — accepted and shipped features ``` STEP 1: SCOPE (product-owner) → discovery + Gherkin stories + criteria -STEP 2: ARCH (software-engineer) → read all features + existing package files, write domain stubs (signatures only, no bodies); ADRs to docs/architecture/ +STEP 2: ARCH (software-engineer) → read all features + existing package files, write domain stubs (signatures only, no bodies); decisions appended to docs/architecture.md STEP 3: TDD LOOP (software-engineer) → RED → GREEN → REFACTOR, one @id at a time STEP 4: VERIFY (reviewer) → run all commands, review code -STEP 5: ACCEPT (product-owner) → demo, validate, move folder to completed/ +STEP 5: ACCEPT (product-owner) → demo, validate, move .feature to completed/ (PO only) ``` **PO picks the next feature from backlog. Software-engineer never self-selects.** @@ -23,14 +23,25 @@ STEP 5: ACCEPT (product-owner) → demo, validate, move folder to compl ## Roles -- **Product Owner (PO)** — AI agent. Interviews the stakeholder, writes discovery docs, Gherkin features, and acceptance criteria. Accepts or rejects deliveries. +- **Product Owner (PO)** — AI agent. Interviews the stakeholder, writes discovery docs, Gherkin features, and acceptance criteria. Accepts or rejects deliveries. **Sole owner of all `.feature` file moves** (backlog → in-progress before Step 2; in-progress → completed after Step 5 acceptance). - **Stakeholder** — Human. 
Answers PO's questions, provides domain knowledge, approves PO syntheses to confirm discovery is complete. -- **Software Engineer** — AI agent. Architecture, test bodies, implementation, git. Never edits `.feature` files. Escalates spec gaps to PO. -- **Reviewer** — AI agent. Adversarial verification. Reports spec gaps to PO. +- **Software Engineer** — AI agent. Architecture, test bodies, implementation, git. Never edits or moves `.feature` files. Escalates spec gaps to PO. If no `.feature` file is in `in-progress/`, stops and escalates to PO. +- **Reviewer** — AI agent. Adversarial verification. Reports spec gaps to PO. Never moves `.feature` files. After APPROVED report, stops and escalates to PO for Step 5. + +## Feature File Chain of Responsibility + +`.feature` files are owned exclusively by the PO. **No other agent ever moves or edits them.** + +| Transition | Who | When | +|---|---|---| +| `backlog/` → `in-progress/` | PO only | Before Step 2 begins; only if `Status: BASELINED` | +| `in-progress/` → `completed/` | PO only | After Step 5 acceptance | + +**If an agent (SE or reviewer) finds no `.feature` in `in-progress/`**: update TODO.md with the correct `Next:` escalation line and stop. Never self-select a backlog feature. 
## Agents -- **product-owner** — defines scope (4 phases), picks features, accepts deliveries +- **product-owner** — defines scope (Stage 1 Discovery + Stage 2 Specification), picks features, accepts deliveries - **software-engineer** — architecture, tests, code, git, releases (Steps 2-3 + release) - **reviewer** — runs commands and reviews code at Step 4, produces APPROVED/REJECTED report - **setup-project** — one-time setup to initialize a new project from this template @@ -49,52 +60,75 @@ STEP 5: ACCEPT (product-owner) → demo, validate, move folder to compl | `code-quality` | software-engineer | pre-handoff (redirects to `verify`) | | `pr-management` | software-engineer | 5 | | `git-release` | software-engineer | 5 | +| `living-docs` | product-owner | 5 (after acceptance) + on stakeholder demand | | `create-skill` | software-engineer | meta | | `create-agent` | human-user | meta | **Session protocol**: Every agent loads `skill session-workflow` at session start. Load additional skills as needed for the current step. -## Step 1 — SCOPE (4 Phases) +## Step 1 — SCOPE + +Step 1 has two stages: + +### Stage 1 — Discovery (PO + stakeholder, iterative) + +Discovery is a continuous process. Sessions happen whenever scope needs to be established or refined — for a new project, new features, or new information. Every session follows the same structure: + +**Session question order:** +1. **General** (5Ws + Success + Failure + Out-of-scope) — first session only, if the journal doesn't exist yet +2. **Cross-cutting** — behavior groups, bounded contexts, integration points, lifecycle events +3. 
**Per-feature** — one feature at a time; extract entities from `docs/discovery.md` Domain Model; gap-finding with CIT, Laddering, CI Perspective Change + +**Real-time split rule**: if the PO detects >2 concerns or >8 candidate Examples for a feature during per-feature questions, split immediately — record the split in the journal, create stub `.feature` files, continue questions for both in the same session. -### Phase 1 — Project Discovery (once per project) -PO creates `docs/features/discovery.md` using the 3-session template. **Skip Phase 1 entirely if `discovery.md` Status is BASELINED.** To add features to an existing project: append new questions to Session 1 and re-fill from there. +**After questions (PO alone, in order):** +1. Append answered Q&A (in groups) to `docs/discovery_journal.md` — only answered questions +2. Rewrite `.feature` description for each feature touched — others stay unchanged +3. Append session synthesis block to `docs/discovery.md` — LAST, after all `.feature` updates -- **Session 1** — Individual scope elicitation: 5Ws + Success + Failure + Out-of-scope. Gap-finding per answer using CIT, Laddering, and CI Perspective Change. PO writes synthesis; stakeholder confirms or corrects. PO runs silent pre-mortem on confirmed synthesis. Template §1 must be confirmed before Session 2. -- **Session 2** — Behavior groups / big picture: questions target behavior groups and cross-cutting concerns. Gap-finding per group. Level 2 synthesis when transitioning between groups. Template §2 must be complete before Session 3. -- **Session 3** — Synthesis approval + feature derivation: PO produces full synthesis of all behavior groups; stakeholder approves or corrects (PO refines until approved). Domain analysis: nouns/verbs → subject areas → FDD "Action object" feature names. Create `backlog/<name>.feature` stubs. Write `Status: BASELINED` to `discovery.md`. 
+**Session status**: the journal session header begins with `Status: IN-PROGRESS` (written before questions). Updated to `Status: COMPLETE` after all writes. If a session is interrupted, the next agent detects `IN-PROGRESS` and resumes the pending writes before starting a new session. -### Phase 2 — Feature Discovery (per feature) -Each `.feature` file has its own 3-session discovery template in its description. **Sessions are enforced by the template: each section must be filled before proceeding to the next.** +**Baselining**: PO writes `Status: BASELINED (YYYY-MM-DD)` in the `.feature` file when the stakeholder approves that feature's discovery and the decomposition check passes. -- **Session 1** — Individual entity elicitation: populate Entities table from project discovery; generate questions from entity gaps using CIT, Laddering, CI Perspective Change. PO writes synthesis; stakeholder confirms. Silent pre-mortem on confirmed synthesis. -- **Session 2** — Behavior groups / big picture: questions target behavior groups within this feature. Gap-finding per group. Level 2 group transition summaries. -- **Session 3** — Feature synthesis approval + story derivation: PO produces synthesis of feature scope and behavior groups; stakeholder approves or corrects (PO refines until approved). Story candidates become candidate user stories (Rules). Write `Status: BASELINED` to `.feature` discovery section. +Commit per session: `feat(discovery): <session summary>` -**Decomposition check**: after Session 3, does this feature span >2 distinct concerns OR have >8 candidate Examples? YES → split into separate `.feature` files, re-run Phase 2. NO → proceed. +### Stage 2 — Specification (PO alone, per feature) -### Phase 3 — Stories (PO alone) -Story candidates from Phase 2 Session 2 → one `Rule:` block per user story. Each `Rule:` has the user story header (`As a / I want / So that`) as its description — no `Example:` blocks yet. INVEST gate: all 6 letters must pass. 
Commit: `feat(stories): write user stories for <name>` +Only runs on features with `Status: BASELINED`. No stakeholder involvement. If a gap requires stakeholder input, open a new Stage 1 session first. -### Phase 4 — Criteria (PO alone) -Pre-mortem per Rule (all Rules must be checked before writing Examples). Write `Example:` blocks — declarative Given/When/Then, MoSCoW triage (Must/Should/Could) per Example. Review checklist (4.3). Commit: `feat(criteria): write acceptance criteria for <name>` +**Step A — Stories**: derive one `Rule:` block per user story from the baselined feature description. INVEST gate: all 6 letters must pass. +Commit: `feat(stories): write user stories for <name>` -**Criteria are frozen**: no `Example:` changes after commit. Adding new Example with new `@id` replaces old. +**Step B — Criteria**: PO writes `Example:` blocks with `@id` tags under each `Rule:`. Pre-mortem per Rule before writing any Examples. MoSCoW triage per Example. Examples are frozen after commit. +Commit: `feat(criteria): write acceptance criteria for <name>` + +**Criteria are frozen**: no `Example:` changes after commit. Adding a new Example with a new `@id` replaces old. + +### Bug Handling + +When a defect is reported: +1. **PO** adds a `@bug` Example to the relevant `Rule:` in the `.feature` file and moves (or keeps) the feature in `backlog/` for normal scheduling. +2. **SE** handles the bug when the feature is selected for development (standard Step 2–3 flow): implements the specific `@bug`-tagged test in `tests/features/<feature_slug>/` and also writes a `@given` Hypothesis property test in `tests/unit/` covering the whole class of inputs. +3. Both tests are required. SE follows the normal TDD loop (Step 3). 
## Filesystem Structure ``` -docs/features/ - discovery.md ← project-level (Status + Questions only) - backlog/<feature-name>.feature ← one per feature; discovery + Rules + Examples - in-progress/<feature-name>.feature ← file moves here at Step 2 - completed/<feature-name>.feature ← file moves here at Step 5 - -docs/architecture/ - STEP2-ARCH.md ← Step 2 reference diagram (canonical) - adr-NNN-<title>.md ← one per significant architectural decision +docs/ + discovery_journal.md ← raw Q&A, PO appends after every session + discovery.md ← synthesis changelog, PO appends after every session + architecture.md ← all architectural decisions, SE appends after Step 2 + glossary.md ← living glossary, PO updates via living-docs skill + c4/ + context.md ← C4 Level 1 diagram, PO updates via living-docs skill + container.md ← C4 Level 2 diagram, PO updates via living-docs skill + features/ + backlog/<feature-stem>.feature ← narrative + Rules + Examples + in-progress/<feature-stem>.feature + completed/<feature-stem>.feature tests/ - features/<feature-name>/ + features/<feature_slug>/ <rule_slug>_test.py ← one per Rule: block, software-engineer-written unit/ <anything>_test.py ← software-engineer-authored extras (no @id traceability) @@ -109,33 +143,24 @@ Tests in `tests/unit/` are software-engineer-authored extras not covered by any ## Test File Layout ``` -tests/features/<feature-name>/<rule_slug>_test.py +tests/features/<feature_slug>/<rule_slug>_test.py ``` -### Function Naming +### Stub Format -```python -def test_<rule_slug>_<8char_hex>() -> None: -``` - -### Docstring Format (mandatory) +Stubs are auto-generated by pytest-beehave. The SE triggers generation at Step 2 end by running `uv run task test-fast`. 
pytest-beehave reads the in-progress `.feature` file and creates one skipped function per `@id`: ```python @pytest.mark.skip(reason="not yet implemented") -def test_wall_bounce_a3f2b1c4() -> None: +def test_<feature_slug>_<@id>() -> None: """ - Given: A ball moving upward reaches y=0 - When: The physics engine processes the next frame - Then: The ball velocity y-component becomes positive + <@id steps raw text including new lines> """ - # Given - # When - # Then ``` ### Markers - `@pytest.mark.slow` — takes > 50ms; applied to Hypothesis tests and any test with I/O, network, or DB -- `@pytest.mark.deprecated` — auto-skipped by conftest; used for superseded Examples +- `@pytest.mark.deprecated` — auto-skipped by pytest-beehave; used for superseded Examples ## Development Commands @@ -155,8 +180,8 @@ uv run task test-fast # Run full test suite with coverage uv run task test -# Run slow tests only -uv run task test-slow +# Run tests with coverage report generation +uv run task test-build # Lint and format uv run task lint @@ -164,32 +189,30 @@ uv run task lint # Type checking uv run task static-check -# Serve documentation -uv run task doc-serve +# Build documentation +uv run task doc-build ``` ## Code Quality Standards -- **Principles (in priority order)**: YAGNI > KISS > DRY > SOLID > Object Calisthenics -- **Linting**: ruff, Google docstring convention, `noqa` forbidden +- **Principles (in priority order)**: YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns > complex code > complicated code > failing code > no code +- **Linting**: ruff format, ruff check, Google docstring convention, `noqa` forbidden - **Type checking**: pyright, 0 errors required - **Coverage**: 100% (measured against your actual package) -- **Function length**: ≤ 20 lines -- **Class length**: ≤ 50 lines +- **Function length**: ≤ 20 lines (code lines only, excluding docstrings) +- **Class length**: ≤ 50 lines (code lines only, excluding docstrings) - **Max nesting**: 2
levels - **Instance variables**: ≤ 2 per class *(exception: dataclasses, Pydantic models, value objects, and TypedDicts are exempt — they may carry as many fields as the domain requires)* - **Semantic alignment**: tests must operate at the same abstraction level as the acceptance criteria they cover -- **Integration tests**: multi-component features require at least one test in `tests/features/` that exercises the public entry point end-to-end ### Software-Engineer Quality Gate Priority Order During Step 3 (TDD Loop), correctness priorities are: -1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns +1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns > complex code > complicated code > failing code > no code 2. **One test green** — the specific test under work passes, plus `test-fast` still passes -3. **Reviewer code-design check** — reviewer verifies design + semantic alignment (no lint/pyright/coverage) -4. **Commit** — only after reviewer APPROVED -5. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at software-engineer handoff (before Step 5) +3. **Reviewer code-design check** — reviewer verifies design + semantic alignment (no lint/pyright/coverage yet) +4. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at software-engineer handoff (before Step 4) Design correctness is far more important than lint/pyright/coverage compliance. A well-designed codebase with minor lint issues is better than a lint-clean codebase with poor design. @@ -200,10 +223,6 @@ Design correctness is far more important than lint/pyright/coverage compliance. - Both are required. All-green automated checks are necessary but not sufficient for APPROVED. - Reviewer defaults to REJECTED unless correctness is proven. -## Deprecation Process - -This template does not support deprecation.
Criteria changes are handled by adding new Examples with new `@id` tags. - ## Release Management Version format: `v{major}.{minor}.{YYYYMMDD}` @@ -212,13 +231,13 @@ Version format: `v{major}.{minor}.{YYYYMMDD}` - Same-day second release: increment minor, keep same date - Each release gets a unique adjective-animal name -Use `@software-engineer /skill git-release` for the full release process. +Use `@software-engineer /skill git-release` for the full release process when requested by the stakeholder. ## Session Management Every session: load `skill session-workflow`. Read `TODO.md` first, update it at the end. -`TODO.md` is a session bookmark — not a project journal. See `docs/workflow.md` for the full structure including the Cycle State and Self-Declaration blocks used during Step 4. +`TODO.md` is a session bookmark — not a project journal. See `.opencode/skills/session-workflow/SKILL.md` for the full structure including the Cycle State block used during Step 3. ## Setup diff --git a/CHANGELOG.md b/CHANGELOG.md index 21916ee..9415c05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,60 @@ All notable changes to this template will be documented in this file.
+## [v6.2.20260419] - Autonomous Stenella - 2026-04-19 + +### Added +- **pytest-beehave integration**: `@id` tags now auto-assigned to untagged `Example:` blocks on every `pytest` run; test stubs auto-generated from `.feature` files at Step 2 end — no manual ID generation or stub writing required (#78) +- **Self-declaration defense in depth**: all 25 items numbered 1–25 in `implementation/SKILL.md`; `verify/SKILL.md` now hard-gates on completeness (count must equal 25, sequence must be gapless) before item audit begins (#78) + +### Changed +- **Naming convention**: `.feature` file paths now use `<feature-stem>` (kebab); test directories use `<feature_slug>` (underscore) — applied consistently across all skills, `AGENTS.md`, and docs (#78) +- **`conftest.py`**: removed manual `deprecated` marker skip hook — now owned entirely by pytest-beehave (#78) +- **`scope/SKILL.md`**: removed all manual `@id` generation instructions and `@id` uniqueness checklist items — assignment is automatic (#78) +- **`product-owner.md`**: removed `@id` from bug handling and gap-resolution table — PO writes `Example:` blocks only (#78) +- **README**: added "Why this template?" 
section; added `pytest-beehave` to tooling table; replaced static stub example with a two-part Gherkin-in → stub-out illustration (#78) +- **`verify/SKILL.md` report table**: expanded Self-Declaration Audit from 21 collapsed rows to 25 numbered rows matching the implementation template exactly (#78) + +## [v6.1.20260419] - Contextual Ambystoma - 2026-04-19 (hotfix) + +### Added +- **living-docs skill**: new PO skill for generating C4 architecture diagrams (`docs/c4/context.md`, `docs/c4/container.md`) and maintaining the living glossary (`docs/glossary.md`) after each feature acceptance (Step 5) or on stakeholder demand +- **docs/c4/**: new directory for C4 Level 1 (Context) and Level 2 (Container) Mermaid diagrams; placeholder `.gitkeep` added +- **docs/glossary.md**: new living glossary file owned by `living-docs` skill (PO); terms sourced from completed feature files, `docs/discovery.md` Domain Model, and `docs/architecture.md` decisions +- **Scientific research — documentation.md**: new file with 4 entries (#59–62): Ko et al. 2007 (information needs), Winters et al. 
2020 (docs-as-code), Procida 2021 (Diátaxis framework), Allspaw 2012 (blameless post-mortems) +- **Scientific research — domain-modeling.md**: 6 new DDD entries (#63–68): Evans DDD Reference CC-BY, Fowler UbiquitousLanguage bliki, Fowler BoundedContext bliki, Vernon IDDD, Verraes "UL is not a glossary", Evans Whirlpool process +- **Scientific research — architecture.md**: 4 new entries (#55–58): Nygard ADRs, Kruchten 4+1 View Model, Brown C4 Model, Parnas information hiding + +### Changed +- **discovery.md template**: `### Scope` section renamed to `### Context` — the section is a session-level general-context synthesis, not a complete project scope definition +- **scope/SKILL.md**: updated `### Scope` references to `### Context` in Step C instructions and template block +- **living-docs/SKILL.md**: glossary entry format updated — `**Context:**` renamed to `**Bounded context:**` (mandatory for multi-context projects); `Domain Event` added as a distinct Type value; secondary-artifact note added to preamble; source-traceability rule replaces "do not invent" rule; checklist updated accordingly +- **implementation/SKILL.md**: Step 2 Read Phase now includes `docs/glossary.md` as item 2 — SE reads existing domain terms before naming classes, methods, and modules to avoid inventing synonyms +- **create-skill/SKILL.md**: `living-docs` added to available skills table +- **AGENTS.md**: skills table updated with `living-docs`; filesystem structure section updated (`docs/c4/`, `docs/glossary.md` added; `docs/architecture/` subtree removed; TODO.md reference updated) + +### Removed +- **docs/architecture/**: folder deleted; the ADR log lives at `docs/architecture.md` (SE-owned); the old `adr-template.md` inside the folder was redundant +- **docs/workflow.md**: deleted; canonical workflow reference is `AGENTS.md` and the skills under `.opencode/skills/` +- **Dockerfile / docker-compose.yml**: removed as unused template artifacts + +## [v6.0.20260419] - Declarative Nautilus - 
2026-04-19 + +### Added +- **PO Self-Declaration**: mandatory 11-claim checklist (INVEST I/V/S/T, observable Then, no impl details, entity coverage, distinct examples, unique IDs, pre-mortem, scope boundary) written into TODO.md at end of Stage 2 Step B before criteria commit; every DISAGREE is a hard blocker (#71) +- **Reviewer Stance Declaration**: 5-claim block (adversarial mindset, manual trace, boundary check, semantic read, independence) added to verify/SKILL.md report template before APPROVED/REJECTED verdict; DISAGREE allowed with explanation, unexplained DISAGREE = REJECTED (#71) +- **session-workflow**: Step 1 Stage 2 Criteria TODO format section with full Self-Declaration template and Rule 9 enforcing the declaration before criteria commit (#71) +- **Three append-only project docs**: `docs/discovery_journal.md` (raw Q&A), `docs/discovery.md` (synthesis changelog), `docs/architecture.md` (architectural decisions) replace the old flat `docs/features/discovery.md` (#70) + +### Changed +- **Discovery model** (breaking): Phase 1 / Phase 2 / Phase 3 / Phase 4 replaced by 2-stage model — Stage 1 Discovery (unified iterative sessions, PO + stakeholder) and Stage 2 Specification (PO alone, per BASELINED feature) (#70) +- **Feature file moves** (breaking): PO is now the sole owner of all `.feature` file moves (backlog → in-progress and in-progress → completed); SE and reviewer explicitly prohibited from moving files with clear escalation protocol (#70) +- **Session protocol**: discovery journal sessions use `Status: IN-PROGRESS` / `Status: COMPLETE` markers; real-time split rule (>2 concerns or >8 candidate Examples splits within the same session); journal writes only answered Q&A in groups (#70) +- **Bug handling**: explicit protocol — PO adds `@bug @id` Example, SE writes both the `@id` test in `tests/features/` and a `@given` Hypothesis property test in `tests/unit/`; both required (#70) +- **scope/SKILL.md**: full rewrite to 2-stage model with session start 
checklist, question order (general → cross-cutting → per-feature), after-questions steps, baselining section, and bug handling section (#70) +- **feature-selection/SKILL.md**: updated "Phase 4 (Criteria)" reference to "Stage 2 Step B (Criteria)" (#70) +- **All agent files and skills**: updated to reflect new document model, terminology, and chain of responsibility (#70, #71) + ## [v5.2.20260418] - Emergent Colugo - 2026-04-18 (hotfix) ### Fixed diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index e6645ec..0000000 --- a/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -# syntax=docker/dockerfile:1.7 -# Simplified Dockerfile for python-project-template -# Single-stage development-focused build - -ARG PYTHON_VERSION=3.13.1 - -FROM python:${PYTHON_VERSION}-slim AS base - -# Install uv for fast Python package management -RUN pip install --upgrade pip uv - -# Create non-root user -RUN groupadd --gid 1001 appuser && \ - useradd --uid 1001 --gid appuser --shell /bin/bash --create-home appuser - -WORKDIR /app - -# Copy dependency files first (better layer caching) -COPY pyproject.toml uv.lock* ./ - -# Install dependencies -RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync --locked --dev - -# Copy source code -COPY . . 
- -# Change ownership to appuser -RUN chown -R appuser:appuser /app -USER appuser - -# Configure Python -ENV PYTHONPATH=/app -ENV PYTHONUNBUFFERED=1 -ENV PYTHONDONTWRITEBYTECODE=1 - -# Expose common ports -EXPOSE 8000 8080 5678 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD python -m app || exit 1 - -# Default command -CMD ["python", "-m", "app"] - -# Labels -LABEL maintainer="eol" -LABEL version="3.0.20260414" -LABEL description="Python template with some awesome tools to quickstart any Python project" -LABEL org.opencontainers.image.source="https://github.com/nullhack/python-project-template" \ No newline at end of file diff --git a/README.md b/README.md index cc4aed1..a89bb28 100644 --- a/README.md +++ b/README.md @@ -32,23 +32,35 @@ uv run task test && uv run task lint && uv run task static-check --- -## What You Get +## Why this template? -### A structured 5-step development cycle +Most Python templates give you a folder structure and a `Makefile`. 
This one gives you a **complete delivery system**: + +- **No feature starts without written acceptance criteria** — Gherkin `Example:` blocks traced to tests +- **No feature ships without adversarial review** — the reviewer's default hypothesis is "broken" +- **No guesswork on test stubs** — they are generated automatically from your `.feature` files +- **No manual `@id` tags** — assigned automatically when you run tests +- **AI agents for every role** — PO, SE, and reviewer each have scoped instructions; none can exceed their authority + +--- + +## How it works + +### 5-step delivery cycle ``` SCOPE → ARCH → TDD LOOP → VERIFY → ACCEPT ``` -| Step | Who | What | -|------|-----|------| -| **SCOPE** | Product Owner | Discovery interviews → Gherkin stories → `@id` criteria | -| **ARCH** | Software Engineer | Module design, ADRs, test stubs | -| **TDD LOOP** | Software Engineer | RED → GREEN → REFACTOR, one `@id` at a time | -| **VERIFY** | Reviewer | Adversarial verification — default hypothesis: broken | -| **ACCEPT** | Product Owner | Demo, validate, ship | +| Step | Role | Output | +|------|------|--------| +| **1 · SCOPE** | Product Owner | Discovery interviews + Gherkin stories + acceptance criteria | +| **2 · ARCH** | Software Engineer | Module stubs, ADRs, auto-generated test stubs | +| **3 · TDD LOOP** | Software Engineer | RED → GREEN → REFACTOR, one criterion at a time | +| **4 · VERIFY** | Reviewer | Adversarial check — lint, types, coverage, semantic review | +| **5 · ACCEPT** | Product Owner | Demo, validate, ship | -WIP limit of 1. 
Features are `.feature` files that move between filesystem folders: +**WIP limit: 1 feature at a time.** Features are `.feature` files that move through folders: ``` docs/features/backlog/ ← waiting @@ -58,12 +70,12 @@ docs/features/completed/ ← shipped ### AI agents included -``` -@product-owner — scope, stories, acceptance -@software-engineer — architecture, TDD, git, releases -@reviewer — adversarial verification -@setup-project — one-time project initialisation -``` +| Agent | Responsibility | +|-------|---------------| +| `@product-owner` | Scope, stories, acceptance criteria, delivery acceptance | +| `@software-engineer` | Architecture, TDD loop, git, releases | +| `@reviewer` | Adversarial verification — default position: broken | +| `@setup-project` | One-time project initialisation | ### Quality tooling, pre-configured @@ -73,6 +85,7 @@ docs/features/completed/ ← shipped | `ruff` | Lint + format (Google docstrings) | | `pyright` | Static type checking — 0 errors | | `pytest` + `hypothesis` | Tests + property-based testing | +| `pytest-beehave` | Auto-generates test stubs from `.feature` files | | `pytest-cov` | Coverage — 100% required | | `pdoc` | API docs → GitHub Pages | | `taskipy` | Task runner | @@ -91,7 +104,7 @@ uv run task run # Run the app --- -## Code Standards +## Code standards | | | |---|---| @@ -104,19 +117,31 @@ uv run task run # Run the app --- -## Test Convention +## Test convention + +Write acceptance criteria in Gherkin: + +```gherkin +@id:a3f2b1c4 +Example: User sees version on startup + Given the application starts + When no arguments are passed + Then the version string is printed to stdout +``` + +Run tests once — a traced, skipped stub appears automatically: ```python @pytest.mark.skip(reason="not yet implemented") -def test_feature_a3f2b1c4() -> None: +def test_display_version_a3f2b1c4() -> None: """ - Given: ... - When: ... - Then: ... 
+ Given the application starts + When no arguments are passed + Then the version string is printed to stdout """ ``` -Each test is traced to exactly one `@id` acceptance criterion. +Each test is traced to exactly one acceptance criterion. No orphan tests. No untested criteria. --- diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index f8708f8..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,86 +0,0 @@ -# Docker Compose for python-project-template -# Simplified development setup - -services: - # ============================================================================= - # Main application - # ============================================================================= - app: - build: - context: . - dockerfile: Dockerfile - container_name: python-template-app - volumes: - # Hot reload: mount source code - - ./app:/app/app - - ./tests:/app/tests - - ./pyproject.toml:/app/pyproject.toml:ro - ports: - - "8000:8000" # Main application - - "8080:8080" # Documentation server - - "5678:5678" # Debug port - environment: - - PYTHONPATH=/app - - PYTHONUNBUFFERED=1 - - DEVELOPMENT=true - command: python -m app - restart: unless-stopped - - # ============================================================================= - # Test runner - # ============================================================================= - test: - build: - context: . - dockerfile: Dockerfile - container_name: python-template-test - volumes: - - ./:/app:ro - environment: - - PYTHONPATH=/app - - PYTHONUNBUFFERED=1 - command: task test - profiles: - - test - - # ============================================================================= - # Documentation server - # ============================================================================= - docs: - build: - context: . 
- dockerfile: Dockerfile - container_name: python-template-docs - volumes: - - ./app:/app/app:ro - - ./pyproject.toml:/app/pyproject.toml:ro - ports: - - "8080:8080" - environment: - - PYTHONPATH=/app - command: task doc-serve - profiles: - - docs - - # ============================================================================= - # Code quality checks - # ============================================================================= - quality: - build: - context: . - dockerfile: Dockerfile - container_name: python-template-quality - volumes: - - ./:/app:ro - environment: - - PYTHONPATH=/app - command: bash -c "task lint && task static-check" - profiles: - - quality - -# ============================================================================= -# Networks -# ============================================================================= -networks: - default: - name: python-template-network \ No newline at end of file diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..2edabcd --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,19 @@ +# Architecture: <project-name> + +--- + +## YYYY-MM-DD — <feature-stem>: <short title> + +Decision: <what was decided — one sentence> +Reason: <why — one sentence> +Alternatives considered: <what was rejected and why> +Feature: <feature-stem> + +--- + +## YYYY-MM-DD — Cross-feature: <short title> + +Decision: <what was decided> +Reason: <why> +Alternatives considered: <what was rejected and why> +Affected features: <feature-stem>, <feature-stem> diff --git a/docs/architecture/adr-template.md b/docs/architecture/adr-template.md deleted file mode 100644 index d86faf9..0000000 --- a/docs/architecture/adr-template.md +++ /dev/null @@ -1,10 +0,0 @@ -# ADR-NNN: <title> - -**Status:** PROPOSED | ACCEPTED | SUPERSEDED by ADR-NNN - -**Decision:** <what was decided — one sentence> - -**Reason:** <why — one sentence> - -**Alternatives considered:** -- <option>: <why rejected> diff --git 
a/docs/c4/.gitkeep b/docs/c4/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/discovery.md b/docs/discovery.md new file mode 100644 index 0000000..9b8a33f --- /dev/null +++ b/docs/discovery.md @@ -0,0 +1,21 @@ +# Discovery: <project-name> + +--- + +## Session: YYYY-MM-DD + +### Context +<3–5 sentence synthesis: who the users are, what the product does, why it exists, +success/failure conditions, and explicit out-of-scope boundaries.> +(First session only. Omit this subsection in subsequent sessions.) + +### Feature List +- `<feature-stem>` — <one-sentence description of what changed or was added> +(Write "No changes" if no features were added or modified this session.) + +### Domain Model +| Type | Name | Description | In Scope | +|------|------|-------------|----------| +| Noun | <name> | <description> | Yes | +| Verb | <name> | <description> | Yes | +(Write "No changes" if domain model was not updated this session.) diff --git a/docs/discovery_journal.md b/docs/discovery_journal.md new file mode 100644 index 0000000..ef538fe --- /dev/null +++ b/docs/discovery_journal.md @@ -0,0 +1,32 @@ +# Discovery Journal: <project-name> + +--- + +## YYYY-MM-DD — Session 1 +Status: IN-PROGRESS + +### General + +| ID | Question | Answer | +|----|----------|--------| +| Q1 | Who are the users? | ... | +| Q2 | What does the product do at a high level? | ... | +| Q3 | Why does it exist — what problem does it solve? | ... | +| Q4 | When and where is it used? | ... | +| Q5 | Success — what does "done" look like? | ... | +| Q6 | Failure — what must never happen? | ... | +| Q7 | Out-of-scope — what are we explicitly not building? | ... | + +### <Group Name> + +| ID | Question | Answer | +|----|----------|--------| +| Q8 | ... | ... | + +### Feature: <feature-stem> + +| ID | Question | Answer | +|----|----------|--------| +| Q9 | ... | ... 
| + +Status: COMPLETE diff --git a/docs/features/completed/display-version.feature b/docs/features/completed/display-version.feature index be7059a..0dfc3dd 100644 --- a/docs/features/completed/display-version.feature +++ b/docs/features/completed/display-version.feature @@ -1,22 +1,12 @@ Feature: Display version - Discovery: + Reads the application version from pyproject.toml at runtime and logs it at INFO + level. Log output is controlled by a verbosity parameter; the version is visible + at DEBUG and INFO but suppressed at WARNING and above. An invalid verbosity value + raises a descriptive error. Status: COMPLETED - Entities: - | Type | Name | Candidate Class/Method | In Scope | - |------|------------------|-----------------------------|----------| - | Noun | Version string | version() | Yes | - | Noun | pyproject.toml | (source of truth) | Yes | - | Noun | Log output | logging | Yes | - | Noun | Verbosity level | ValidVerbosity | Yes | - | Noun | Entry point | main() | Yes | - | Verb | Retrieve | version() | Yes | - | Verb | Display / Log | main() | Yes | - | Verb | Configure | ValidVerbosity | Yes | - | Verb | Validate | main() raises ValueError | Yes | - Rules (Business): - Version is read from pyproject.toml at runtime using tomllib - Log verbosity is controlled by a ValidVerbosity parameter passed to main() @@ -29,12 +19,6 @@ Feature: Display version - Entry point: app/__main__.py (main(verbosity) function) - Version logic: app/version.py (version() function) - Questions: - | ID | Question | Answer | Status | - |----|----------|--------|--------| - - All questions answered. Discovery frozen. 
- Rule: Version retrieval As a software-engineer I want to retrieve the application version programmatically diff --git a/docs/features/discovery.md b/docs/features/discovery.md deleted file mode 100644 index f764e10..0000000 --- a/docs/features/discovery.md +++ /dev/null @@ -1,41 +0,0 @@ -# Discovery: <project-name> - -## State -Status: ELICITING - ---- - -## Session 1 — Individual Scope Elicitation - -| ID | Question | Answer | Status | -|----|----------|--------|--------| -| Q1 | Who are the users of this product? | | OPEN | -| Q2 | What does the product do at a high level? | | OPEN | -| Q3 | Why does it exist — what problem does it solve? | | OPEN | -| Q4 | When and where is it used (environment, platform, context)? | | OPEN | -| Q5 | How do we know it works? What does "done" look like? | | OPEN | -| Q6 | What does failure look like? What must never happen? | | OPEN | -| Q7 | What are we explicitly not building? | | OPEN | - -Template §1: PENDING -Synthesis: (fill after stakeholder confirms answers) -Pre-mortem: (fill after synthesis is confirmed) - ---- - -## Session 2 — Behavior Groups / Big Picture - -| ID | Question | Answer | Status | -|----|----------|--------|--------| - -Template §2: PENDING -Behavior Groups: -- (fill after all group questions are answered) - ---- - -## Session 3 — Full Synthesis - -(fill after Sessions 1 and 2 are complete) - -Template §3: PENDING diff --git a/docs/scientific-research/README.md b/docs/scientific-research/README.md index cb9fd99..3338996 100644 --- a/docs/scientific-research/README.md +++ b/docs/scientific-research/README.md @@ -8,8 +8,9 @@ Theoretical and empirical foundations for the decisions made in this template, o | `testing.md` | 11–15, 51–54 | Observable behavior testing, test-behavior alignment, first-class tests, property-based testing, mutation testing, Canon TDD, GOOS outer/inner loop, Is TDD Dead, BDD origin | | `software-economics.md` | 16 | Cost of change curve (shift left) | | 
`requirements-elicitation.md` | 17–20, 28–30, 43–50 | INVEST, Example Mapping, declarative Gherkin, MoSCoW, active listening, Kipling 5Ws, BA framework, FDD, affinity mapping, Event Storming, CIT, cognitive interview, laddering, funnel technique, RE issues | -| `domain-modeling.md` | 31 | DDD bounded contexts, ubiquitous language, feature identification | +| `domain-modeling.md` | 31, 63–68 | DDD bounded contexts, ubiquitous language, feature identification, DDD Reference, Fowler UL/BC bliki, Vernon IDDD, Verraes UL-not-glossary, Whirlpool | | `oop-design.md` | 32–35 | Object Calisthenics, Refactoring (Fowler), GoF Design Patterns, SOLID | | `refactoring-empirical.md` | 36–41 | QDIR smell prioritization, smells + architectural refactoring, SPIRIT tool, bad OOP engineering properties, CWC complexity metric, metric threshold unreliability | -| `architecture.md` | 42 | Hexagonal Architecture — ports and adapters | +| `architecture.md` | 42, 55–58 | Hexagonal Architecture, ADRs, 4+1 View Model, C4 model, information hiding | | `ai-agents.md` | 21–27 | Minimal-scope agent design, context isolation, on-demand skills, instruction conflict resolution failure, positional attention degradation, modular prompt de-duplication, three-file separation | +| `documentation.md` | 59–62 | Developer information needs, docs-as-code, Diátaxis documentation framework, blameless post-mortems | diff --git a/docs/scientific-research/architecture.md b/docs/scientific-research/architecture.md index 5b5bb5f..8cf3a9d 100644 --- a/docs/scientific-research/architecture.md +++ b/docs/scientific-research/architecture.md @@ -18,7 +18,69 @@ Foundations for the architectural decisions and patterns used in this template. --- +### 55. Architecture Decision Records (ADRs) + +| | | +|---|---| +| **Source** | Nygard, M. T. (2011). "Documenting Architecture Decisions." *cognitect.com*. 
https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions | +| **Date** | 2011 | +| **Alternative** | Keeling, M. (2017). *Design It!: From Programmer to Software Architect*. Pragmatic Bookshelf. (Chapter 6: "Architectural Decisions") | +| **Status** | Confirmed — widely adopted industry standard; tooled by adr-tools, ADR Manager, Log4Brains | +| **Core finding** | Architectural decisions should be recorded as short, immutable documents capturing: what was decided, why, and what alternatives were rejected. Without this record, decisions get re-litigated by every new developer (or AI agent) who encounters the codebase, producing rework and re-divergence. | +| **Mechanism** | An ADR is written at decision time, never edited afterward. If the decision changes, a new ADR is written that supersedes the old one. The append-only record becomes a reliable audit trail. The constraint "one sentence per field" forces clarity — if you can't state the reason in one sentence, the decision is not yet understood. | +| **Where used** | `docs/architecture.md` (ADR template). SE appends one block per non-obvious decision after Step 2. The `living-docs` skill reads ADRs as input for C4 diagram annotations. | + +--- + +### 56. The 4+1 View Model of Architecture + +| | | +|---|---| +| **Source** | Kruchten, P. B. (1995). "The 4+1 View Model of Architecture." *IEEE Software*, 12(6), 42–50. https://doi.org/10.1109/52.469759 | +| **Date** | 1995 | +| **Alternative** | Bass, L., Clements, P., & Kazman, R. (2021). *Software Architecture in Practice* (4th ed.). Addison-Wesley. | +| **Status** | Confirmed — 3,000+ citations; foundational IEEE reference for architectural documentation | +| **Core finding** | A single architectural diagram cannot communicate all relevant aspects of a system.
Four distinct views are required: **Logical** (domain objects and relationships), **Process** (runtime behavior and concurrency), **Development** (module organisation and dependencies), **Physical** (deployment topology). A fifth **Scenarios** view (use cases) ties the four together by showing how each scenario exercises each view. | +| **Mechanism** | Different stakeholders need different views: a developer needs the Development view; an operator needs the Physical view; a domain expert needs the Logical view. Conflating views into one diagram produces a cluttered diagram that satisfies nobody. The 4+1 model assigns each concern to its appropriate view and cross-validates them through scenarios. | +| **Where used** | Theoretical foundation for the C4 model (entry 57). The `living-docs` skill generates C4 diagrams that map to: Context diagram (Scenarios view), Container diagram (Physical + Development views), Component diagram (Logical + Development views). | + +--- + +### 57. The C4 Model for Software Architecture + +| | | +|---|---| +| **Source** | Brown, S. (2018). *The C4 Model for Software Architecture*. Leanpub. https://c4model.com | +| **Date** | 2018 (ongoing) | +| **Alternative** | Brown, S. (2023). "The C4 model for visualising software architecture." *InfoQ*. | +| **Status** | Confirmed — widely adopted; tooled by Structurizr, PlantUML C4, Mermaid C4 | +| **Core finding** | Software architecture can be communicated at four zoom levels: **Level 1 — System Context** (who uses the system and what external systems it talks to), **Level 2 — Container** (major runnable/deployable units), **Level 3 — Component** (major structural building blocks within a container), **Level 4 — Code** (classes, interfaces; usually auto-generated). Each level answers a specific question; mixing levels in one diagram creates confusion. 
| +| **Mechanism** | C4 operationalises the 4+1 View Model (entry 56) into a lightweight notation that can be expressed in text (PlantUML, Mermaid) and version-controlled alongside code. The notation is deliberately constrained: boxes (people, systems, containers, components) and unidirectional arrows with labels. No UML formalism required. Context + Container diagrams cover >90% of communication needs for most teams. | +| **Where used** | The `living-docs` skill generates and updates C4 diagrams in `docs/c4/`. Context diagram (L1) always generated; Container (L2) generated when multiple containers are identified; Component (L3) generated on demand. Source files are Mermaid so they render in GitHub and are version-controlled. | + +--- + +### 58. Information Hiding — Module Decomposition Criterion + +| | | +|---|---| +| **Source** | Parnas, D. L. (1972). "On the criteria to be used in decomposing systems into modules." *Communications of the ACM*, 15(12), 1053–1058. https://doi.org/10.1145/361598.361623 | +| **Date** | 1972 | +| **Alternative** | Parnas, D. L. (1974). "On a 'buzzword': Hierarchical structure." *Proc. IFIP Congress 74*, 336–339. | +| **Status** | Confirmed — 4,000+ citations; foundational criterion for all modular decomposition in software engineering | +| **Core finding** | The correct criterion for decomposing a system into modules is **information hiding**: each module hides a design decision that is likely to change. A module's interface reveals only what callers need; its implementation hides how. Decomposing by execution steps (procedure-based) creates tight coupling to implementation order; decomposing by change-prone decisions (information-hiding) allows each decision to be changed independently. | +| **Mechanism** | Identify which decisions are most likely to change (data structures, algorithms, I/O formats, external service protocols). Each such decision becomes a module boundary. 
The module's public interface is defined to be change-stable; the implementation is change-free from the caller's perspective. This is the theoretical basis for SOLID-D (depend on abstractions), Hexagonal Architecture (hide external decisions behind ports), and DDD bounded contexts (hide language decisions behind context boundaries). | +| **Where used** | Step 2 Architecture: bounded context check ("same word, different meaning across features? → module boundary") and external dep Protocol assignment both apply the information-hiding criterion. The `living-docs` skill uses module boundaries as container/component boundaries in `docs/c4/` diagrams. | + +--- + ## Bibliography -1. Cockburn, A. (2005). Hexagonal Architecture. *alistair.cockburn.us*. https://alistair.cockburn.us/hexagonal-architecture/ -2. Freeman, S., & Pryce, N. (2009). *Growing Object-Oriented Software, Guided by Tests*. Addison-Wesley. +1. Bass, L., Clements, P., & Kazman, R. (2021). *Software Architecture in Practice* (4th ed.). Addison-Wesley. +2. Brown, S. (2018). *The C4 Model for Software Architecture*. Leanpub. https://c4model.com +3. Cockburn, A. (2005). Hexagonal Architecture. *alistair.cockburn.us*. https://alistair.cockburn.us/hexagonal-architecture/ +4. Freeman, S., & Pryce, N. (2009). *Growing Object-Oriented Software, Guided by Tests*. Addison-Wesley. +5. Keeling, M. (2017). *Design It!: From Programmer to Software Architect*. Pragmatic Bookshelf. +6. Kruchten, P. B. (1995). The 4+1 View Model of Architecture. *IEEE Software*, 12(6), 42–50. https://doi.org/10.1109/52.469759 +7. Nygard, M. T. (2011). Documenting Architecture Decisions. *cognitect.com*. https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions +8. Parnas, D. L. (1972). On the criteria to be used in decomposing systems into modules. *CACM*, 15(12), 1053–1058. 
https://doi.org/10.1145/361598.361623 diff --git a/docs/scientific-research/documentation.md b/docs/scientific-research/documentation.md new file mode 100644 index 0000000..9c77a00 --- /dev/null +++ b/docs/scientific-research/documentation.md @@ -0,0 +1,69 @@ +# Scientific Research — Documentation + +Foundations for living documentation, docs-as-code, information architecture, and post-mortem practices used in this template. + +--- + +### 59. Information Needs in Collocated Software Development Teams + +| | | +|---|---| +| **Source** | Ko, A. J., DeLine, R., & Venolia, G. (2007). "Information Needs in Collocated Software Development Teams." *Proc. 29th International Conference on Software Engineering (ICSE 2007)*, pp. 344–353. IEEE. https://doi.org/10.1109/ICSE.2007.45 | +| **Date** | 2007 | +| **Alternative** | Dagenais, B., & Robillard, M. P. (2010). "Creating and evolving developer documentation." *Proc. FSE 2010*, pp. 127–136. ACM. | +| **Status** | Confirmed — empirical study; 600+ citations | +| **Core finding** | Developers spend 35–50% of their working time not writing code but searching for information — navigating code, reading past decisions, and understanding relationships between components. The most frequently sought information is: who wrote this, why was it written this way, and what does this module depend on. Direct questioning of teammates is the most common fallback when documentation is absent, creating serial bottlenecks. | +| **Mechanism** | Information seeking is triggered by a task, not by curiosity. A developer encountering an unfamiliar component has a specific decision to make. When documentation is absent, the seek-ask-wait loop (find the right person, ask, wait for a response) dominates time. Persistent documentation (ADRs, architecture diagrams, glossary) short-circuits this loop by making the answer findable without a human intermediary. 
| +| **Where used** | Justifies the full `living-docs` skill: C4 diagrams answer "what does this module depend on?"; the ADR record answers "why was it written this way?"; the living glossary answers "what does this term mean in this context?". Collectively these eliminate the three most frequent information needs identified by Ko et al. | + +--- + +### 60. Software Engineering at Google — Documentation Chapter + +| | | +|---|---| +| **Source** | Winters, T., Manshreck, T., & Wright, H. (2020). *Software Engineering at Google: Lessons Learned from Programming Over Time*. O'Reilly. Chapter 10: "Documentation." https://abseil.io/resources/swe-book/html/ch10.html | +| **Date** | 2020 | +| **Alternative** | Fitzpatrick, B., & Collins-Sussman, B. (2012). *Team Geek*. O'Reilly. | +| **Status** | Confirmed — large-scale industry evidence from a codebase with ~2 billion lines of code | +| **Core finding** | Documentation that lives outside the code repository decays at a rate proportional to how often the code changes — because there is no mechanism that forces the doc to be updated when the code changes. Docs-as-code (documentation in the same repo, reviewed in the same PRs, tested in the same CI pipeline) dramatically reduces divergence because the cost of updating the doc is incurred at the same moment as the cost of the code change. | +| **Mechanism** | Google's g3doc system co-locates docs with the code they describe. When a PR changes `payments/service.py`, the reviewer also sees `payments/README.md` in the diff and can flag staleness immediately. At scale, Google found that docs with no co-located tests or CI checks become stale within 3–6 months regardless of team discipline. | +| **Where used** | Justifies co-locating `docs/` within the project repository. Living docs (`docs/c4/`, `docs/glossary.md`) are updated in the same commits as the code they describe.
The `living-docs` skill is the mechanism that enforces this — it runs after Step 5 to regenerate diagrams from the current state of the codebase and discovery docs. | + +--- + +### 61. Diátaxis — A Systematic Framework for Technical Documentation + +| | | +|---|---| +| **Source** | Procida, D. (2021). "Diátaxis — A systematic approach to technical documentation." *diataxis.fr*. https://diataxis.fr | +| **Date** | 2021 | +| **Status** | Confirmed — adopted by Django, NumPy, Gatsby, Cloudflare, and the Python Software Foundation | +| **Core finding** | Technical documentation fails because it conflates four fundamentally different needs into a single undifferentiated text. The four types are: **Tutorials** (learning-oriented; guides a beginner through a complete task), **How-to guides** (task-oriented; solves a specific problem for a practitioner), **Reference** (information-oriented; describes the system accurately and completely), **Explanation** (understanding-oriented; discusses concepts and decisions). Each type has a different audience mental state and requires a different writing mode. Mixing them degrades all four. | +| **Mechanism** | The two axes of Diátaxis are: **practical ↔ theoretical** (tutorials and how-to guides are practical; reference and explanation are theoretical) and **acquiring ↔ applying** (tutorials and explanation are for acquiring knowledge; how-to guides and reference are for applying it). A document that tries to be both a tutorial and a reference simultaneously will be a poor tutorial (too much information) and a poor reference (not structured for lookup). | +| **Where used** | Documentation structure in this template maps to Diátaxis: `README.md` = tutorial (getting started), `AGENTS.md` = reference (complete description of roles, skills, commands) and explanation (why the workflow exists), `docs/c4/` = reference (system structure), post-mortems = explanation (why decisions were made). 
The `living-docs` skill produces reference-type documentation (C4 diagrams, glossary) — not tutorials. | + +--- + +### 62. Blameless Post-Mortems and a Just Culture + +| | | +|---|---| +| **Source** | Allspaw, J. (2012). "Blameless PostMortems and a Just Culture." *code.etsy.com* (archived). https://www.etsy.com/codeascraft/blameless-postmortems/ | +| **Date** | 2012 | +| **Alternative** | Dekker, S. (2006). *The Field Guide to Understanding Human Error*. Ashgate. | +| **Status** | Confirmed — foundational DevOps/SRE practice; referenced in Google SRE Book (2016) | +| **Core finding** | Post-mortems that assign blame produce less information and lower long-term system reliability than blameless post-mortems. When individuals believe they will be blamed, they withhold information about contributing factors, preventing the systemic causes from being identified and fixed. A blameless post-mortem treats the incident as a system failure, not an individual failure — asking "what conditions allowed this to happen?" not "who caused this?" | +| **Mechanism** | Allspaw's model separates two questions: (1) what happened? (factual, blameless) and (2) what changes would prevent recurrence? (systemic). The post-mortem document records both. The output is not an individual's performance review but a list of system changes — process improvements, documentation gaps, tooling additions. Etsy's incident rate fell after adopting blameless post-mortems because engineers began reporting near-misses that they previously concealed. | +| **Where used** | `docs/post-mortem/` directory. Post-mortems in this template follow the blameless model: they report workflow gaps found, not who made the mistake. The output of each post-mortem is a list of improvements to skills, agents, or workflow documentation. The `living-docs` skill is one such improvement — it emerged from the discovery that architecture and glossary documentation were falling behind the codebase. | + +--- + +## Bibliography + +1. 
Allspaw, J. (2012). Blameless PostMortems and a Just Culture. *code.etsy.com*. https://www.etsy.com/codeascraft/blameless-postmortems/ +2. Dagenais, B., & Robillard, M. P. (2010). Creating and evolving developer documentation. *Proc. FSE 2010*, pp. 127–136. ACM. +3. Dekker, S. (2006). *The Field Guide to Understanding Human Error*. Ashgate. +4. Ko, A. J., DeLine, R., & Venolia, G. (2007). Information Needs in Collocated Software Development Teams. *Proc. ICSE 2007*, pp. 344–353. https://doi.org/10.1109/ICSE.2007.45 +5. Procida, D. (2021). Diátaxis — A systematic approach to technical documentation. *diataxis.fr*. https://diataxis.fr +6. Winters, T., Manshreck, T., & Wright, H. (2020). *Software Engineering at Google*. O'Reilly. Chapter 10. https://abseil.io/resources/swe-book/html/ch10.html diff --git a/docs/scientific-research/domain-modeling.md b/docs/scientific-research/domain-modeling.md index d49be2e..eb9143e 100644 --- a/docs/scientific-research/domain-modeling.md +++ b/docs/scientific-research/domain-modeling.md @@ -14,7 +14,92 @@ Foundations for bounded context identification, ubiquitous language, and feature | **Status** | Confirmed — foundational DDD literature | | **Core finding** | A Bounded Context is a boundary within which a particular ubiquitous language is consistent. Features are identified by grouping related user stories that share the same language. The decomposition criterion is "single responsibility per context" + "consistency of language." | | **Mechanism** | In DDD: (1) Extract ubiquitous language from requirements → (2) Group by language consistency → (3) Each group is a candidate bounded context → (4) Each bounded context maps to a feature. Context Mapper automates this: User Stories → Subdomains (via noun/verb extraction) → Bounded Contexts of type FEATURE. | -| **Where used** | Phase 1: after feature list identification, verify each feature has consistent language. 
Phase 2: noun/verb extraction from project discovery answers populates the Entities table — domain analysis cannot begin before this. The "Rules (Business)" section captures the ubiquitous language rules that govern each feature. | +| **Where used** | Stage 1 Discovery: after session synthesis, verify each feature has consistent language. Noun/verb extraction from discovery answers builds the Domain Model in `docs/discovery.md`. The `Rules (Business):` section in `.feature` files captures the ubiquitous language rules that govern each feature. | + +--- + +### 63. DDD Reference — Pattern Summaries (CC-BY) + +| | | +|---|---| +| **Source** | Evans, E. (2015). *DDD Reference: Definitions and Pattern Summaries*. domainlanguage.com. https://www.domainlanguage.com/ddd/reference/ | +| **Date** | 2015 | +| **Alternative** | Evans, E. (2003). *Domain-Driven Design*. Addison-Wesley. (full book; entry #31) | +| **Status** | Confirmed — freely available CC-BY canonical summary; maintained by Evans personally | +| **Core finding** | The open-access pattern summary of all DDD patterns from the 2003 book. More precisely citable than the book for specific pattern definitions. Key patterns: Ubiquitous Language ("Use the model as the backbone of a language. Commit the team to exercising that language relentlessly in all communication within the team and in the code."), Bounded Context, Context Map, Domain Events, Aggregates, Repositories. | +| **Mechanism** | Each pattern is described with: intent, prescription, and "therefore" consequences. The Ubiquitous Language pattern prescribes: use the same terms in diagrams, writing, and especially speech. Refactor the code when the language changes. Resolve confusion over terms in conversation, the way confusion over ordinary words is resolved — by agreement and precision. | +| **Where used** | Primary reference for `docs/discovery.md` Domain Model structure and the ubiquitous language practice. 
`living-docs` skill glossary entries derive from this: terms must match code identifiers (Evans' "use the same language in code" prescription). `docs/scientific-research/domain-modeling.md`. | +| **Note** | Supersedes entry #31 as the citable source for specific pattern quotes. Entry #31 remains as the book reference. Use this entry when citing a specific Evans pattern definition. | + +--- + +### 64. UbiquitousLanguage — Fowler Bliki + +| | | +|---|---| +| **Source** | Fowler, M. (2006). "UbiquitousLanguage." *martinfowler.com*. https://martinfowler.com/bliki/UbiquitousLanguage.html | +| **Date** | 2006 | +| **Alternative** | Evans (2015) DDD Reference (entry #63) — the primary source Fowler summarises | +| **Status** | Confirmed — widely cited secondary source; Fowler wrote the DDD foreword and is considered the authoritative secondary interpreter of Evans | +| **Core finding** | The ubiquitous language is a practice, not a document. The glossary is a secondary artifact — a snapshot of the current state of the language. The language itself lives in conversation, in the code, and in all written communication. "By using the model-based language pervasively and not being satisfied until it flows, we approach a model that is complete and comprehensible." Domain experts must object to inadequate terms; developers must flag ambiguity. | +| **Mechanism** | The key test of a ubiquitous language: can a domain expert read the domain layer code and recognize their domain? If the code uses different names than the glossary, the code must be refactored — not the glossary relaxed. The language evolves through experimentation with alternative expressions, followed by code refactoring to match the new model. | +| **Where used** | `living-docs` skill — grounds the rule "verify each term matches the identifier used in the code's domain layer." `docs/glossary.md` — the glossary is explicitly secondary to the code. `docs/scientific-research/domain-modeling.md`. | + +--- + +### 65. 
BoundedContext — Fowler Bliki + +| | | +|---|---| +| **Source** | Fowler, M. (2014). "BoundedContext." *martinfowler.com*. https://martinfowler.com/bliki/BoundedContext.html | +| **Date** | 2014 | +| **Alternative** | Evans (2015) DDD Reference (entry #63) — Fowler cites Evans directly | +| **Status** | Confirmed — includes a direct Evans quote; the canonical accessible reference for Bounded Context as a design pattern | +| **Core finding** | "Total unification of the domain model for a large system will not be feasible or cost-effective" (Evans, quoted directly). The same word can mean different things in different Bounded Contexts — this is not a defect but a reflection of domain reality. "You need a different model when the language changes." A Bounded Context is the boundary within which a particular ubiquitous language is internally consistent. Terms must be qualified by their context when a project has more than one bounded context. | +| **Mechanism** | Fowler's electricity utility example: the word "meter" meant different things in billing, grid management, and customer service. Attempting to unify these into one definition created confusion. Each bounded context maintains its own model and its own language. Context Maps document the relationships and translation rules between bounded contexts. | +| **Where used** | `living-docs` skill — `**Bounded context:**` field in `docs/glossary.md` entries is mandatory when the project has more than one bounded context (this is the Evans/Fowler requirement). `docs/scientific-research/domain-modeling.md`. | + +--- + +### 66. Implementing Domain-Driven Design + +| | | +|---|---| +| **Source** | Vernon, V. (2013). *Implementing Domain-Driven Design*. Addison-Wesley. 
| +| **Date** | 2013 | +| **Alternative** | Evans (2003) DDD (entry #31) — Vernon explicitly builds on Evans | +| **Status** | Confirmed — second most cited DDD book; ~5,000 citations | +| **Core finding** | Three additions to Evans: (1) **Domain Events as first-class vocabulary** — past-tense verb phrases ("OrderPlaced," "VersionDisplayed") are part of the ubiquitous language and belong in the glossary as a distinct type. (2) **Context Maps as the organizing principle** for multi-context glossaries — each bounded context has its own language documentation; the Context Map shows translation rules between contexts. (3) **Documentation co-located with the code** — docs in the same repository decay at the same rate as the code, dramatically reducing divergence. | +| **Mechanism** | Vernon's IDDD samples (github.com/VaughnVernon/IDDD_Samples) demonstrate all three in practice. The Product Owner / Business Analyst plays the domain-expert-representative role in glossary maintenance — validating semantic correctness — while developers own structural precision. Neither writes the glossary unilaterally. | +| **Where used** | `living-docs` skill — `Domain Event` added as a distinct Type value in `docs/glossary.md` entries. Grounds the PO-owned glossary with SE input via `docs/architecture.md` Reason: fields. `docs/scientific-research/domain-modeling.md`. | + +--- + +### 67. Ubiquitous Language Is Not a Glossary — Verraes + +| | | +|---|---| +| **Source** | Verraes, M. (2013). "Ubiquitous Language Is Not a Glossary." *verraes.net*. 
https://web.archive.org/web/20131004/https://verraes.net/2013/04/ubiquitous-language-is-not-a-glossary/ | +| **Date** | 2013 | +| **Alternative** | Fowler (2006) UbiquitousLanguage (entry #64) — the same secondary-artifact point, less pointed | +| **Status** | Confirmed — original URL is 404; widely documented through community discussion and practitioner secondary accounts; thesis is uncontested in the DDD community | +| **Core finding** | A glossary is not a ubiquitous language. Teams that maintain a glossary but do not reflect its terms in the code have the *appearance* of a ubiquitous language without the substance. The glossary is a secondary artifact derived from the code and domain-expert conversations — not the reverse. The canonical source of truth is the domain layer code, not the glossary document. A glossary that diverges from the code is lying. | +| **Mechanism** | The test: can a domain expert read the domain layer code and recognize their domain without a translator? If yes, the ubiquitous language exists. If the only evidence of the language is the glossary document, it does not exist. Consequence: every term added to the glossary must be verified against the corresponding code identifier. | +| **Where used** | `living-docs` skill — grounds the checklist item "Verify each term matches the identifier used in the code's domain layer." Prevents the common failure mode of glossary-as-theatre. `docs/scientific-research/domain-modeling.md`. | + +--- + +### 68. Whirlpool Process of Model Exploration — Evans + +| | | +|---|---| +| **Source** | Evans, E. (2011). *Whirlpool Process of Model Exploration*. domainlanguage.com. https://www.domainlanguage.com/ddd/whirlpool/ | +| **Date** | 2011 | +| **Alternative** | Brandolini, A. (2013). *Introducing EventStorming*. Leanpub. 
— a later, more structured alternative to Whirlpool | +| **Status** | Confirmed — freely available; Evans' own post-2003 process guidance | +| **Core finding** | Model exploration is a cycle: Scenario Exploring → Harvesting Abstractions → Probing the Model → Challenging the Model → back to Scenario Exploring. New vocabulary crystallizes at the Harvesting Abstractions step — concrete scenarios surface candidate terms, which are then named, defined, and reflected in the code. The glossary grows at each Harvesting Abstractions step. | +| **Mechanism** | The Whirlpool is not a development process — it fits within most iterative processes. It is a model-exploration subprocess triggered whenever the team encounters a poorly understood domain concept. The output of each cycle is a refined model expressed in clearer language, with updated code identifiers and glossary entries. | +| **Where used** | `living-docs` skill — grounds the timing of glossary updates: after each completed feature (Step 5) corresponds to the Harvesting Abstractions step in the Whirlpool. Discovery sessions (Stage 1) correspond to Scenario Exploring. `docs/scientific-research/domain-modeling.md`. | --- @@ -22,3 +107,9 @@ Foundations for bounded context identification, ubiquitous language, and feature 1. Context Mapper. (2025). Rapid Object-Oriented Analysis and Design. https://contextmapper.org/docs/rapid-ooad 2. Evans, E. (2003). *Domain-Driven Design: Tackling Complexity in the Heart of Software*. Addison-Wesley. +3. Evans, E. (2011). *Whirlpool Process of Model Exploration*. domainlanguage.com. https://www.domainlanguage.com/ddd/whirlpool/ +4. Evans, E. (2015). *DDD Reference: Definitions and Pattern Summaries* (CC-BY). domainlanguage.com. https://www.domainlanguage.com/ddd/reference/ +5. Fowler, M. (2006). UbiquitousLanguage. martinfowler.com. https://martinfowler.com/bliki/UbiquitousLanguage.html +6. Fowler, M. (2014). BoundedContext. martinfowler.com. 
https://martinfowler.com/bliki/BoundedContext.html +7. Vernon, V. (2013). *Implementing Domain-Driven Design*. Addison-Wesley. +8. Verraes, M. (2013). Ubiquitous Language Is Not a Glossary. verraes.net (archived). https://web.archive.org/web/20131004/https://verraes.net/2013/04/ubiquitous-language-is-not-a-glossary/ diff --git a/docs/scientific-research/requirements-elicitation.md b/docs/scientific-research/requirements-elicitation.md index ec5e68f..b272727 100644 --- a/docs/scientific-research/requirements-elicitation.md +++ b/docs/scientific-research/requirements-elicitation.md @@ -27,7 +27,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed | | **Core finding** | Inserting a "rules" layer between stories and examples prevents redundant or contradictory acceptance criteria. A story with many rules needs splitting; a story with many open questions is not ready for development. | | **Mechanism** | Four card types: Story (yellow), Rules (blue), Examples (green), Questions (red). The rules layer groups related examples under the business rule they illustrate. Red cards (unanswered questions) are a first-class signal to stop and investigate. | -| **Where used** | `## Rules` section in per-feature `discovery.md` (Phase 2). PO identifies business rules before writing Examples in Phase 4. | +| **Where used** | `Rules (Business):` section in each `.feature` file. PO identifies business rules before writing Examples in Stage 2 Step B. | --- @@ -40,7 +40,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed | | **Core finding** | Declarative Gherkin ("When Bob logs in") produces specifications that survive UI changes. Imperative Gherkin ("When I click the Login button") couples specs to implementation details and breaks on every UI redesign. | | **Mechanism** | Declarative steps describe *what happens* at the business level. 
Imperative steps describe *how the user interacts with a specific UI*. AI agents are especially prone to writing imperative Gherkin because they mirror literal steps. | -| **Where used** | Declarative vs. imperative table in Phase 4 of `scope/SKILL.md`. | +| **Where used** | Declarative vs. imperative table in Stage 2 Step B (Criteria) of `scope/SKILL.md`. | --- @@ -53,7 +53,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed | | **Core finding** | Classifying requirements as Must/Should/Could/Won't forces explicit negotiation about what is essential vs. desired. When applied *within* a single story, it reveals bloated stories that should be split. | | **Mechanism** | DSDM mandates that Musts cannot exceed 60% of total effort. At the story level: if a story has 12 Examples and only 3 are Musts, the remaining 9 can be deferred. This prevents gold-plating and keeps stories small. | -| **Where used** | MoSCoW triage in Phase 4 of `scope/SKILL.md`. | +| **Where used** | MoSCoW triage in Stage 2 Step B (Criteria) of `scope/SKILL.md`. | --- @@ -80,7 +80,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Synthesized rule of thumb — each component individually confirmed | | **Core finding** | Active listening in requirements interviews operates at three granularities: **Level 1** (per answer) — immediate paraphrase; **Level 2** (per topic cluster) — transition summary; **Level 3** (end of interview) — full synthesis serving four downstream purposes. | | **Level 3 — four uses** | 1. Accuracy gate (NN/G). 2. Scope crystallization (Ambler/FDD). 3. Input to domain modeling (Ambler/FDD). 4. Baseline trigger (Wynne/Cucumber). | -| **Where used** | Phase 1 and Phase 2 of `scope/SKILL.md`. | +| **Where used** | Stage 1 Discovery sessions in `scope/SKILL.md`. 
| --- @@ -93,7 +93,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Alternative** | Hermagoras of Temnos (2nd century BCE) — seven circumstances of rhetoric. | | **Status** | Practitioner synthesis — journalism, business analysis, investigative methodology | | **Core finding** | The six interrogative questions (Who, What, When, Where, Why, How) form a complete framework for gathering all essential facts about any situation. Together they ensure completeness and prevent gaps. | -| **Where used** | Phase 1 project discovery: the initial seven questions are an adaptation of the 5W1H framework. | +| **Where used** | Stage 1 Discovery, General questions (first session): the initial seven questions are an adaptation of the 5W1H framework. | --- @@ -105,7 +105,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Date** | 2025 | | **Status** | Practitioner synthesis — consolidated BA methodology, not peer-reviewed | | **Core finding** | Ten questions consistently make the most difference in requirements elicitation: (1) What problem are we solving? (2) What happens if we do nothing? (3) Who uses this? (4) What does success look like? (5) Walk me through how this works today. (6) Where does this usually break? (7) What decisions will this help? (8) What should definitely not happen? (9) What happens if input is wrong? (10) What assumptions are we making? | -| **Where used** | Phase 1 project discovery: the "Success", "Failure", and "Out-of-scope" questions map to this framework. | +| **Where used** | Stage 1 Discovery, General questions: the "Success", "Failure", and "Out-of-scope" questions map to this framework. | --- @@ -119,7 +119,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed | | **Core finding** | FDD requires domain modeling *before* feature naming. Features are expressed as "Action result object" triples. 
Features group into Feature Sets (shared domain object), which group into Subject Areas. | | **Mechanism** | Domain modeling extracts the vocabulary (nouns = candidate classes, verbs = candidate methods). Feature identification then asks: "what verbs act on each noun?" | -| **Where used** | Phase 1 of `scope/SKILL.md`: after interview summary is confirmed, PO performs domain analysis (nouns/verbs → subject areas → FDD "Action object" feature names). | +| **Where used** | Stage 1 Discovery in `scope/SKILL.md`: after session synthesis, PO performs domain analysis (nouns/verbs → subject areas → FDD "Action object" feature names) for the first session. | --- @@ -132,7 +132,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Alternative** | Kawakita, J. (1967). *Abduction*. Chuokoronsha. | | **Status** | Confirmed | | **Core finding** | Affinity diagramming groups raw observations/requirements into clusters by bottom-up similarity — no categories are named until grouping is complete. This prevents confirmation bias from top-down pre-labelling. | -| **Where used** | Phase 1 of `scope/SKILL.md` (alternative to FDD domain modeling): PO uses affinity mapping on interview answers to derive feature clusters. Best suited when working from interview transcripts solo. | +| **Where used** | Stage 1 Discovery in `scope/SKILL.md` (alternative to FDD domain modeling): PO uses affinity mapping on interview answers to derive feature clusters. Best suited when working from interview transcripts solo. | --- @@ -145,7 +145,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed | | **Core finding** | Event Storming is a collaborative workshop where domain experts place past-tense domain events on a timeline. Sorting the events creates natural Functional Area clusters — these are candidate feature groups. The workshop also produces Ubiquitous Language, a Problem Inventory, and Actor roles. 
| | **Mechanism** | Temporal sequencing of domain events forces resolution of conflicting mental models across organisational silos. Clusters emerge from shared vocabulary and causal proximity. | -| **Where used** | Optional alternative in Phase 1 of `scope/SKILL.md` for cross-silo discovery. | +| **Where used** | Optional alternative in Stage 1 Discovery in `scope/SKILL.md` for cross-silo discovery. | --- @@ -159,7 +159,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed — foundational; ~200 follow-on empirical studies | | **Core finding** | Anchoring an interview on a specific past incident ("Tell me about a time when X broke down") breaks schema-based recall. Stakeholders describing actual past events report real workarounds, edge cases, and failure modes that never surface when asked "how does this usually work?" | | **Mechanism** | Direct questions elicit the stakeholder's mental schema — a sanitized, gap-free description of how things *should* work. Incidents bypass the schema because episodic memory is anchored to specific sensory and emotional detail. | -| **Where used** | Session 2 (gap-finding) of Phase 1 and Phase 2 in `scope/SKILL.md`. | +| **Where used** | Cross-cutting and per-feature questions (gap-finding) in Stage 1 Discovery in `scope/SKILL.md`. | --- @@ -173,7 +173,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed — meta-analysis: Köhnken et al. (1999), *Psychology, Crime & Law*, 5(1-2), 3–27. | | **Core finding** | The enhanced CI elicits ~35% more correct information than standard interviews with equal accuracy rates. | | **Mechanism** | Four retrieval mnemonics: (1) mental reinstatement of context; (2) report everything; (3) temporal reversal; (4) perspective change. Each mnemonic opens a different memory access route, collectively surfacing what direct questions cannot. 
| -| **Where used** | Session 2 (gap-finding) of Phase 1 and Phase 2 in `scope/SKILL.md`. | +| **Where used** | Cross-cutting and per-feature questions (gap-finding) in Stage 1 Discovery in `scope/SKILL.md`. | --- @@ -186,7 +186,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed — operationalised in IS research (Hunter & Beck 2000) | | **Core finding** | Repeatedly asking "Why is that important to you?" climbs a means-end chain from concrete attribute → functional consequence → psychosocial consequence → terminal value. The stakeholder's first answer is rarely the real constraint. | | **Mechanism** | The Gherkin "So that [benefit]" clause is structurally a single-rung means-end ladder. Full laddering reveals value conflicts between stakeholders whose surface requirements look identical but whose ladders diverge at the consequence level. | -| **Where used** | Session 2 (gap-finding) of Phase 1 and Phase 2 in `scope/SKILL.md`. | +| **Where used** | Cross-cutting and per-feature questions (gap-finding) in Stage 1 Discovery in `scope/SKILL.md`. | --- @@ -200,7 +200,7 @@ Foundations for the PO interview structure, Gherkin criteria, and feature discov | **Status** | Confirmed — standard NNG qualitative research protocol | | **Core finding** | Starting with broad open-ended questions before narrowing to specifics prevents the interviewer from priming the interviewee's responses. | | **Mechanism** | Priming bias is structural: any category name the interviewer introduces activates a schema that filters what the interviewee considers worth reporting. The funnel sequences questions so the interviewee's own categories emerge first. | -| **Where used** | Within each session of Phase 1 and Phase 2 in `scope/SKILL.md`. | +| **Where used** | Within each Stage 1 Discovery session in `scope/SKILL.md`. 
| --- diff --git a/docs/workflow.md b/docs/workflow.md deleted file mode 100644 index 9e24894..0000000 --- a/docs/workflow.md +++ /dev/null @@ -1,548 +0,0 @@ -# Development Workflow - -This document describes the complete feature lifecycle used to develop software with this framework. - ---- - -## Overview - -Features flow through 5 steps with a WIP limit of 1 feature at a time. The filesystem enforces the limit: - -``` -docs/features/backlog/<name>.feature ← waiting -docs/features/in-progress/<name>.feature ← exactly one being built -docs/features/completed/<name>.feature ← accepted and shipped -``` - -Each step has a designated agent and a specific deliverable. No step is skipped. - ---- - -## Full Workflow Diagram - -``` -╔══════════════════════════════════════════════════════════════════════╗ -║ FEATURE LIFECYCLE (WIP = 1) ║ -╚══════════════════════════════════════════════════════════════════════╝ - - FILESYSTEM ENFORCES WIP: - backlog/<name>.feature → in-progress/<name>.feature → completed/<name>.feature - - -┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 1 — SCOPE agent: product-owner │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ Phase 1 — Project Discovery │ -│ [runs ONCE; skip if discovery.md BASELINED] │ -│ [adding features later: append new Qs to Session 1, re-fill] │ -│ │ -│ Session 1 — Individual Scope Elicitation │ -│ 5Ws + Success + Failure + Out-of-scope │ -│ Gap-finding per answer: CIT · Laddering · CI Perspective │ -│ [new questions from elucidation added in the moment] │ -│ Level 1: paraphrase each answer on the spot │ -│ → PO writes synthesis → stakeholder confirms or corrects │ -│ → PO runs silent pre-mortem on confirmed synthesis │ -│ [template §1: synthesis confirmed → unlocks Session 2] │ -│ │ -│ Session 2 — Behavior Groups / Big Picture │ -│ Questions target behavior groups and cross-cutting concerns │ -│ Gap-finding per group: CIT · Laddering · CI Perspective │ -│ [new 
questions from elucidation added in the moment] │ -│ Level 1: paraphrase each answer │ -│ Level 2: synthesis when transitioning between groups │ -│ [template §2: all groups answered → unlocks Session 3] │ -│ │ -│ Session 3 — Synthesis Approval + Feature Derivation │ -│ PO produces full synthesis across all behavior groups │ -│ → stakeholder approves or corrects; PO refines until approved │ -│ [template §3: approval → unlocks domain analysis] │ -│ Domain analysis: nouns/verbs → subject areas │ -│ Name features (FDD "Action object" / Affinity groups) │ -│ Create backlog/<name>.feature stubs │ -│ Status: BASELINED written to discovery.md │ -│ │ -│ Phase 2 — Feature Discovery (repeats per feature) │ -│ [each .feature has its own 3-session discovery template] │ -│ │ -│ Session 1 — Individual Entity Elicitation │ -│ Populate Entities table from project discovery │ -│ Gap-finding per answer: CIT · Laddering · CI Perspective │ -│ [new questions from elucidation added in the moment] │ -│ Level 1: paraphrase each answer │ -│ → PO writes synthesis → stakeholder confirms or corrects │ -│ → PO runs silent pre-mortem on confirmed synthesis │ -│ [template §1: synthesis confirmed → unlocks Session 2] │ -│ │ -│ Session 2 — Behavior Groups / Big Picture for this Feature │ -│ Questions target behavior groups within this feature │ -│ Gap-finding per group: CIT · Laddering · CI Perspective │ -│ [new questions from elucidation added in the moment] │ -│ Level 1: paraphrase · Level 2: group transition summaries │ -│ [template §2: all groups answered → unlocks Session 3] │ -│ │ -│ Session 3 — Feature Synthesis Approval + Story Derivation │ -│ PO produces synthesis of feature scope and behavior groups │ -│ → stakeholder approves or corrects; PO refines until approved │ -│ Story candidates → candidate user stories (Rules) │ -│ Status: BASELINED written to .feature discovery section │ -│ [template §3: approval + stories → unlocks decomp check] │ -│ │ -│ DECOMPOSITION CHECK │ -│ >2 distinct 
concerns OR >8 candidate Examples? │ -│ YES → split into separate .feature files, re-run Phase 2 │ -│ NO → proceed │ -│ │ -│ Phase 3 — Stories (PO alone) │ -│ Story candidates from Phase 2 Session 2 → one Rule: block per story │ -│ INVEST gate: all 6 letters must pass before committing │ -│ commit: feat(stories): write user stories for <name> │ -│ │ -│ Phase 4 — Criteria (PO alone) │ -│ 4.1 Pre-mortem per Rule (all Rules checked before Examples) │ -│ 4.2 Write @id-tagged Examples (Given/When/Then, declarative) │ -│ MoSCoW triage: Must / Should / Could per Example │ -│ 4.3 Review checklist │ -│ commit: feat(criteria): write acceptance criteria for <name> │ -│ ★ FROZEN — changes require @deprecated + new Example │ -│ │ -└─────────────────────────────────────────────────────────────────────┘ - ↓ PO picks feature from backlog — only if Status: BASELINED -┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 2 — ARCHITECTURE agent: software-engineer │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ PREREQUISITES (stop if any fail — escalate to PO) │ -│ [ ] in-progress/ has no .feature file (WIP = 1) │ -│ [ ] feature Status: BASELINED │ -│ [ ] feature has Rule: + Example: + @id tags │ -│ [ ] package name confirmed (pyproject.toml → directory exists) │ -│ │ -│ mv backlog/<name>.feature → in-progress/<name>.feature │ -│ │ -│ READ (all before writing anything) │ -│ docs/features/discovery.md (project-level) │ -│ ALL backlog .feature files (discovery + entities sections) │ -│ in-progress .feature file (full: Rules + Examples + @id) │ -│ ALL existing .py files in <package>/ ← know what exists first │ -│ │ -│ DOMAIN ANALYSIS │ -│ From Entities table + Rules (Business) in .feature file: │ -│ Nouns → named classes, value objects, aggregates │ -│ Verbs → method names with typed signatures │ -│ Datasets → named types (not bare dict/list) │ -│ Bounded Context check: same word, different meaning across │ -│ features? 
→ module boundary goes there │ -│ Cross-feature entities → candidate shared domain layer │ -│ │ -│ SILENT PRE-MORTEM (before writing anything) │ -│ "In 6 months this design is a mess. What mistakes did we make?" │ -│ For each candidate class: >2 ivars? >1 reason to change? │ -│ For each external dep: is it behind a Protocol? │ -│ Any noun serving double duty across modules? │ -│ Any structure missing a named design pattern? │ -│ → If pattern smell detected: load skill design-patterns │ -│ │ -│ WRITE STUBS INTO PACKAGE (signatures only — bodies must be `...`) │ -│ If file exists → add class/method; do not remove existing code │ -│ If file does not exist → create with signatures only │ -│ File placement (common patterns, not required names): │ -│ <package>/domain/<noun>.py ← entities, value objects │ -│ <package>/domain/service.py ← cross-entity operations │ -│ Do not pre-create ports/ or adapters/ without a concrete dep │ -│ Stub rules: │ -│ Bodies: `...` only — no logic, no conditionals │ -│ No docstrings — add after GREEN when signatures are stable │ -│ No inline comments, no TODO, no speculative code │ -│ │ -│ WRITE ADR FILES (significant decisions only) │ -│ docs/architecture/adr-NNN-<title>.md │ -│ Decision: <what> Reason: <why> │ -│ Alternatives considered: <what was rejected and why> │ -│ │ -│ ARCHITECTURE SMELL CHECK — hard gate (fix before commit) │ -│ [ ] No class with >2 responsibilities (SOLID-S) │ -│ [ ] No behavioural class with >2 instance variables (OC-8; │ -│ dataclasses, Pydantic models, value objects, TypedDicts │ -│ are exempt) │ -│ [ ] All external deps assigned a Protocol (SOLID-D + Hexagonal) │ -│ N/A if no external dependencies identified in scope │ -│ [ ] No noun with different meaning across planned modules │ -│ (DDD Bounded Context) │ -│ [ ] No missing Creational pattern: repeated construction │ -│ without Factory/Builder │ -│ [ ] No missing Structural pattern: type-switching logic │ -│ without Strategy/Visitor │ -│ [ ] No missing 
Behavioral pattern: state machine or scattered │ -│ notification without State/Observer │ -│ [ ] Each ADR consistent with each @id AC — no contradictions │ -│ [ ] Technically infeasible story → escalate to PO │ -│ │ -│ commit: feat(<name>): add architecture stubs │ -│ │ -└─────────────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 3 — TDD LOOP agent: software-engineer │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ PREREQUISITES (stop if any fail — escalate to PO) │ -│ [ ] Architecture stubs present in <package>/ (Step 2 committed) │ -│ [ ] Read all docs/architecture/adr-NNN-*.md files │ -│ [ ] All tests written in tests/features/<feature>/ │ -│ │ -│ Build TODO.md test list │ -│ List all @id tags from in-progress .feature file │ -│ Order: fewest dependencies first; most impactful within that │ -│ Each @id = one TODO item, status: pending │ -│ │ -│ OUTER LOOP — one @id at a time │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ Pick next pending @id → mark in_progress in TODO.md │ │ -│ │ (WIP limit: exactly one in_progress at all times) │ │ -│ │ │ │ -│ │ INNER LOOP │ │ -│ │ ┌───────────────────────────────────────────────────────┐ │ │ -│ │ │ RED │ │ │ -│ │ │ Read stubs in <package>/ — base test on them │ │ │ -│ │ │ Write test body (Given/When/Then → Arrange/Act/Assert) │ │ -│ │ │ Update stub signatures as needed — edit .py directly │ │ │ -│ │ │ uv run task test-fast │ │ │ -│ │ │ EXIT: this @id FAILS │ │ │ -│ │ │ (if it passes: test is wrong — fix it first) │ │ │ -│ │ ├───────────────────────────────────────────────────────┤ │ │ -│ │ │ GREEN │ │ │ -│ │ │ Write minimum code — YAGNI + KISS only │ │ │ -│ │ │ (no DRY, SOLID, OC here — those belong in REFACTOR)│ │ │ -│ │ │ uv run task test-fast │ │ │ -│ │ │ EXIT: this @id passes AND all prior tests pass │ │ │ -│ │ │ (fix implementation only; do not advance @id) │ │ │ 
-│ │ ├───────────────────────────────────────────────────────┤ │ │ -│ │ │ REFACTOR │ │ │ -│ │ │ Load skill refactor — follow its protocol │ │ │ -│ │ │ uv run task test-fast after each individual change │ │ │ -│ │ │ EXIT: test-fast passes; no smells remain │ │ │ -│ │ ├───────────────────────────────────────────────────────┤ │ │ -│ │ │ SELF-DECLARE │ │ │ -│ │ │ Fill Self-Declaration block in TODO.md │ │ │ -│ │ │ AGREE/DISAGREE per principle with file:line │ │ │ -│ │ │ DISAGREE requires inline justification │ │ │ -│ │ └───────────────────────────────────────────────────────┘ │ │ -│ │ │ │ -│ │ Mark @id completed in TODO.md │ │ -│ │ Commit when a meaningful increment is green │ │ -│ └─────────────────────────────────────────────────────────────┘ │ -│ Repeat until all @id items completed │ -│ │ -│ QUALITY GATE (all @id green) │ -│ uv run task lint │ -│ uv run task static-check │ -│ uv run task test (coverage must be 100%) │ -│ timeout 10s uv run task run │ -│ coverage < 100%: add test in tests/unit/ for uncovered branch │ -│ (do NOT add @id tests for coverage — @id tests are AC only) │ -│ All must pass before Self-Declaration │ -│ │ -│ SELF-DECLARATION (once, after all quality gates pass) │ -│ As a software-engineer I declare: │ -│ * YAGNI: no code without a failing test — AGREE/DISAGREE | file:line │ -│ * YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line │ -│ * KISS: simplest solution that passes — AGREE/DISAGREE | file:line │ -│ * KISS: no premature optimization — AGREE/DISAGREE | file:line │ -│ * DRY: no duplication — AGREE/DISAGREE | file:line │ -│ * DRY: no redundant comments — AGREE/DISAGREE | file:line │ -│ * SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line│ -│ * SOLID-O: open for extension, closed for modification │ -│ — AGREE/DISAGREE | file:line │ -│ * SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line │ -│ * SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line │ -│ * SOLID-D: depend on abstractions, not 
concretions │ -│ — AGREE/DISAGREE | file:line │ -│ * OC-1: one level of indentation per method — AGREE/DISAGREE | file:line│ -│ * OC-2: no else after return — AGREE/DISAGREE | file:line │ -│ * OC-3: primitive types wrapped — AGREE/DISAGREE | file:line │ -│ * OC-4: first-class collections — AGREE/DISAGREE | file:line │ -│ * OC-5: one dot per line — AGREE/DISAGREE | file:line │ -│ * OC-6: no abbreviations — AGREE/DISAGREE | file:line │ -│ * OC-7: ≤20 lines per function — AGREE/DISAGREE | file:line │ -│ * OC-8: ≤2 instance variables per class (behavioural classes only; dataclasses, Pydantic models, value objects, and TypedDicts are exempt) — AGREE/DISAGREE | file:line │ -│ * OC-9: no getters/setters — AGREE/DISAGREE | file:line │ -│ * Patterns: no creational smell — AGREE/DISAGREE | file:line │ -│ * Patterns: no structural smell — AGREE/DISAGREE | file:line │ -│ * Patterns: no behavioral smell — AGREE/DISAGREE | file:line │ -│ * Semantic: tests operate at same abstraction as AC │ -│ — AGREE/DISAGREE | file:line │ -│ │ -│ → Hand off to Step 4 (Verify) │ -└─────────────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 4 — VERIFY agent: reviewer │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ Default hypothesis: BROKEN. Prove otherwise or REJECT. │ -│ │ -│ 4a. READ │ -│ in-progress .feature file (Rules + Examples + @id) │ -│ Self-Declaration from software-engineer │ -│ │ -│ 4b. pyproject.toml GATE │ -│ git diff main -- pyproject.toml │ -│ Any change → REJECT immediately │ -│ software-engineer must revert + get stakeholder approval │ -│ │ -│ 4c. COMMIT HISTORY │ -│ git log --oneline main..HEAD │ -│ All commits follow conventional commit format? │ -│ No "fix tests", "wip", "temp" commits? │ -│ │ -│ 4d. 
COMMANDS │ -│ uv run task lint (must exit 0) │ -│ uv run task static-check (must exit 0) │ -│ uv run task test (must exit 0, coverage 100%) │ -│ timeout 10s uv run task run (exit 124 = hung = REJECT) │ -│ │ -│ 4e. PRODUCTION GATE │ -│ Does the application behave as described in the feature file? │ -│ Run manually or via integration test — not just green CI │ -│ Input → output check for each Rule: block │ -│ │ -│ 4f. CODE REVIEW (semantic — not covered by tooling) │ -│ [ ] Tests operate at same abstraction level as AC │ -│ [ ] No test asserts implementation details │ -│ [ ] Each @id test covers exactly one Example │ -│ [ ] No logic in tests (no if/for/while) │ -│ [ ] Module structure matches Architecture section │ -│ [ ] No external dependency outside adapters/ │ -│ [ ] Docstrings explain why, not what │ -│ │ -│ 4g. SELF-DECLARATION AUDIT │ -│ For every YES claim: find the file:line — does it hold? │ -│ For every NO claim: is the deviation justified? │ -│ Undeclared violations → REJECT │ -│ │ -│ 4h. INTERACTIVE (if any doubt remains) │ -│ Ask software-engineer one targeted question per ambiguity │ -│ Do not proceed to report if question is unanswered │ -│ │ -│ 4i. 
REPORT │ -│ APPROVED — all gates passed, no undeclared violations │ -│ REJECTED — list each failure with file:line and required fix │ -│ │ -│ On APPROVED → notify PO │ -│ On REJECTED → return to software-engineer (Step 3 quality gate) │ -└─────────────────────────────────────────────────────────────────────┘ - ↓ APPROVED -┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 5 — ACCEPT agent: product-owner │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ PO runs/observes the feature (real user interaction) │ -│ Checks against original Rule: user stories │ -│ │ -│ ACCEPTED: │ -│ mv in-progress/<name>.feature → completed/<name>.feature │ -│ software-engineer creates PR (squash merge) + tags release │ -│ │ -│ REJECTED: │ -│ feedback in TODO.md → back to relevant step │ -│ │ -└─────────────────────────────────────────────────────────────────────┘ -``` - ---- - -## Feature File Structure - -Each feature is a single `.feature` file. The free-form description before the first `Rule:` contains all discovery content added progressively through the workflow: - -``` -Feature: <title> - - Discovery: - - Status: ELICITING | BASELINED (YYYY-MM-DD) - - Entities: - | Type | Name | Candidate Class/Method | In Scope | - - Rules (Business): - - <business rule> - - Constraints: - - <non-functional requirement> - - Session 1 — Individual Entity Elicitation: - | ID | Question | Answer | Status | ← OPEN / ANSWERED - Template §1: PENDING | CONFIRMED - Synthesis: <PO synthesis — confirmed by stakeholder> - Pre-mortem: <gaps identified; new questions added above> - - Session 2 — Behavior Groups / Big Picture: - | ID | Question | Answer | Status | - Template §2: PENDING | CONFIRMED - Behavior Groups: - - <behavior group name>: <one-sentence summary> - - Session 3 — Feature Synthesis: - Template §3: PENDING | CONFIRMED — stakeholder approved YYYY-MM-DD - Synthesis: <full synthesis across all behavior groups> - - Rule: <story title> - 
As a <role> - I want <goal> - So that <benefit> - - @id:a3f2b1c4 - Example: <scenario> - Given <context> - When <action> - Then <observable outcome> -``` - -Two discovery sources: -- `docs/features/discovery.md` — project-level 3-session discovery (once per project; additive for new features) -- Feature file description — per-feature 3-session discovery, entities, business rules, and acceptance criteria - ---- - -## Architecture Artifacts - -Architectural decisions made during Step 2 are recorded as ADR files: - -``` -docs/architecture/ - adr-template.md ← blank template — copy to create a new ADR - adr-001-<title>.md ← one file per significant decision - adr-002-<title>.md - ... -``` - -**ADR format** (copy `adr-template.md` and fill in): - -```markdown -# ADR-NNN: <title> - -**Status:** PROPOSED | ACCEPTED | SUPERSEDED by ADR-NNN - -**Decision:** <what was decided — one sentence> - -**Reason:** <why — one sentence> - -**Alternatives considered:** -- <option>: <why rejected> -``` - -Write an ADR only for non-obvious decisions with real trade-offs — module boundaries, external dependency strategy, Protocol vs. concrete class, data model choices. Routine YAGNI choices do not need an ADR. - -Domain entity and service stubs (signatures, no bodies) live directly in the package under `<package>/domain/`, `<package>/ports/`, and `<package>/adapters/` — written at Step 2, filled in during Step 3. 
- ---- - -## Supporting Tools - -| Command | When | Purpose | -|---|---|---| -| `uv run task gen-todo` | Every session | Reads in-progress `.feature` → syncs `TODO.md` | -| `uv run task test-fast` | Step 3 cycle | Fast test run (no coverage) — used during Red-Green-Refactor | -| `uv run task test` | Handoff, Step 4 | Full suite with coverage — must reach 100% | -| `uv run task lint` | Handoff, Step 4 | ruff — must exit 0 | -| `uv run task static-check` | Handoff, Step 4 | pyright — must exit 0, 0 errors | -| `timeout 10s uv run task run` | Handoff, Step 4 | App must exit cleanly (exit 124 = hang = fix it) | - ---- - -## Test Layout - -``` -tests/ - features/<feature-name>/ - <rule-slug>_test.py ← software-engineer-written, one per Rule: block - function: test_<rule_slug>_<8char_hex>() - unit/ - <anything>_test.py ← software-engineer-authored extras, no @id traceability - plain pytest or Hypothesis @given (software-engineer choice) -``` - ---- - -## TODO.md Structure - -```markdown -# Current Work - -Feature: <name> -Step: <1-5> (<step name>) -Source: docs/features/in-progress/<name>.feature - -## Cycle State -Test: @id:<hex> — <description> -Phase: RED | GREEN | REFACTOR | SELF-DECLARE - -## Self-Declaration -As a software-engineer I declare: -* YAGNI: no code without a failing test — AGREE/DISAGREE | file:line -* YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line -* KISS: simplest solution that passes — AGREE/DISAGREE | file:line -* KISS: no premature optimization — AGREE/DISAGREE | file:line -* DRY: no duplication — AGREE/DISAGREE | file:line -* DRY: no redundant comments — AGREE/DISAGREE | file:line -* SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line -* SOLID-O: open for extension, closed for modification — AGREE/DISAGREE | file:line -* SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line -* SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line -* SOLID-D: depend on abstractions, not concretions — AGREE/DISAGREE | 
file:line -* OC-1: one level of indentation per method — AGREE/DISAGREE | deepest: file:line -* OC-2: no else after return — AGREE/DISAGREE | file:line -* OC-3: primitive types wrapped — AGREE/DISAGREE | file:line -* OC-4: first-class collections — AGREE/DISAGREE | file:line -* OC-5: one dot per line — AGREE/DISAGREE | file:line -* OC-6: no abbreviations — AGREE/DISAGREE | file:line -* OC-7: ≤20 lines per function, ≤50 per class — AGREE/DISAGREE | longest: file:line -* OC-8: ≤2 instance variables per class (behavioural classes only; dataclasses, Pydantic models, value objects, and TypedDicts are exempt) — AGREE/DISAGREE | file:line -* OC-9: no getters/setters — AGREE/DISAGREE | file:line -* Patterns: no creational smell — AGREE/DISAGREE | file:line -* Patterns: no structural smell — AGREE/DISAGREE | file:line -* Patterns: no behavioral smell — AGREE/DISAGREE | file:line -* Semantic: tests operate at same abstraction as AC — AGREE/DISAGREE | file:line - -## Progress -- [x] @id:<hex>: <done> -- [~] @id:<hex>: <in progress> -- [ ] @id:<hex>: <next> - -## Next -<one actionable sentence> -``` - -`## Cycle State` is updated at every phase transition. `## Self-Declaration` is written once after all quality gates pass in Step 3. Both sections are present only during Step 3; omit when in other steps. 
- ---- - -## Roles - -| Role | Type | Responsibilities | -|---|---|---| -| **Stakeholder** | Human | Answers questions, provides domain knowledge, approves syntheses | -| **Product Owner** | AI agent | Interviews stakeholder, writes `.feature` files, picks features, accepts deliveries | -| **Software Engineer** | AI agent | Architecture, tests, code, git, releases | -| **Reviewer** | AI agent | Adversarial verification — defaults to REJECTED until proven correct | - ---- - -## Quality Gates (non-negotiable) - -| Gate | Standard | -|---|---| -| Test coverage | 100% | -| Type errors (pyright) | 0 | -| Lint errors (ruff) | 0 | -| Function length | ≤ 20 lines | -| Class length | ≤ 50 lines | -| Max nesting | 2 levels | -| Instance variables per class | ≤ 2 (behavioural classes only; dataclasses, Pydantic models, value objects, TypedDicts are exempt) | -| `noqa` comments | 0 | -| `type: ignore` comments | 0 | -| Orphaned tests | 0 | -| Hypothesis tests missing `@pytest.mark.slow` | 0 | diff --git a/pyproject.toml b/pyproject.toml index a8dad4d..5ad2e9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "python-project-template" -version = "5.2.20260418" +version = "6.2.20260419" description = "Python template with some awesome tools to quickstart any Python project" readme = "README.md" requires-python = ">=3.13" @@ -23,6 +23,7 @@ Documentation = "https://github.com/nullhack/python-project-template/tree/main/d dev = [ "pdoc>=14.0", "pytest>=9.0.3", + "pytest-beehave[html]>=3.0", "pytest-cov>=6.1.1", "pytest-html>=4.1.1", "pytest-mock>=3.14.0", @@ -81,13 +82,13 @@ minversion = "6.0" markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "deprecated: marks tests for deprecated AC; automatically skipped (deselect with '-m \"not deprecated\"')", + "bug: marks tests that reproduce a reported defect (deselect with '-m \"not bug\"')", ] addopts = """ --maxfail=10 \ --color=yes \ --tb=short \ -q \ ---html=docs/tests/report.html 
\ """ testpaths = ["tests"] python_files = ["*_test.py"] @@ -107,8 +108,16 @@ exclude_lines = [ [tool.taskipy.tasks] run = "python -m app" -test-report = """\ +test-coverage = """\ pytest \ + --cov-config=pyproject.toml \ + --cov=app \ + --cov-fail-under=100 \ + --tb=no +""" +test-build = """\ +pytest \ + -p no:beehave \ --doctest-modules \ --cov-config=pyproject.toml \ --cov-report html:docs/coverage \ @@ -116,12 +125,11 @@ pytest \ --cov=app \ --cov-fail-under=100 \ --hypothesis-show-statistics \ + --html=docs/tests/report.html \ + --self-contained-html \ """ -test = """\ -pytest -m "not slow" -q && \ -task test-report\ -""" -test-fast = "pytest -m \"not slow\" -q" +test = "pytest --tb=short" +test-fast = "pytest -m \"not slow\" -q --no-header --tb=no" test-slow = "pytest -m slow" ruff-check = "ruff check . --fix" ruff-format = "ruff format ." @@ -140,11 +148,12 @@ pytest \ """ doc-publish = "task doc-build && ghp-import -n -p -f docs" static-check = "pyright" -gen-id = "python -c \"import uuid; [print(uuid.uuid4().hex[:8]) for _ in range(20)]\"" -gen-todo = "python .opencode/skills/session-workflow/scripts/gen_todo.py" [dependency-groups] dev = [ "gherkin-official>=39.0.0", "safety>=3.7.0", ] + +[tool.beehave] +features_path = "docs/features" diff --git a/tests/conftest.py b/tests/conftest.py index 9a606f7..a5c8f50 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,10 +21,3 @@ def pytest_html_results_table_header(cells): def pytest_html_results_table_row(report, cells): docstring = getattr(report, "docstrings", "") or "" cells.insert(2, f"<td style='white-space: pre-wrap;'>{docstring}</td>") - - -def pytest_collection_modifyitems(items): - """Automatically skip tests marked as deprecated.""" - for item in items: - if item.get_closest_marker("deprecated"): - item.add_marker(pytest.mark.skip(reason="deprecated")) diff --git a/uv.lock b/uv.lock index dbcc6dd..656335d 100644 --- a/uv.lock +++ b/uv.lock @@ -670,6 +670,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, ] +[[package]] +name = "pytest-beehave" +version = "3.0.20260419" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fire" }, + { name = "gherkin-official" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6b/45/a64788db805fc079792d28670846f8320045bd82e67ea2528f842857606b/pytest_beehave-3.0.20260419.tar.gz", hash = "sha256:bc114a0f809e3b437f09f5d42da0a36a105dc8b7b7e311410a7fdcdc915398f0", size = 28685, upload-time = "2026-04-19T19:11:15.811Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/24/0bfacd345c1b75497f84d83ee3a4459ec30cc4e54fc4530c376f18346ccc/pytest_beehave-3.0.20260419-py3-none-any.whl", hash = "sha256:be3843af1e8691f6023007de147b4f92a8a4ca505f94439f2df210137e746acd", size = 30323, upload-time = "2026-04-19T19:11:14.168Z" }, +] + +[package.optional-dependencies] +html = [ + { name = "pytest-html" }, +] + [[package]] name = "pytest-cov" version = "6.1.1" @@ -735,7 +753,7 @@ wheels = [ [[package]] name = "python-project-template" -version = "5.2.20260418" +version = "6.2.20260419" source = { virtual = "." 
} dependencies = [ { name = "fire" }, @@ -748,6 +766,7 @@ dev = [ { name = "pdoc" }, { name = "pyright" }, { name = "pytest" }, + { name = "pytest-beehave", extra = ["html"] }, { name = "pytest-cov" }, { name = "pytest-html" }, { name = "pytest-mock" }, @@ -769,6 +788,7 @@ requires-dist = [ { name = "pdoc", marker = "extra == 'dev'", specifier = ">=14.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.407" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.3" }, + { name = "pytest-beehave", extras = ["html"], marker = "extra == 'dev'", specifier = ">=3.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.1.1" }, { name = "pytest-html", marker = "extra == 'dev'", specifier = ">=4.1.1" }, { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" },