Prompt Caching Demo

Chat with Claude — cache metrics update after each response
Implementation:
demos/agno-prompt-caching/main.py (225 lines)
1"""
2Agno prompt caching demo.
3
4Demonstrates four caching rules for Agno agents backed by claude-sonnet-4-20250514:
5  Rule 1 — add_datetime_to_instructions=False, cache_system_prompt=True
6  Rule 2 — all tools passed once at Agent() construction
7  Rule 3 — dynamic context injected via <system-reminder> in user message
8  Rule 4 — compaction clones agent with same description/instructions/tools
9"""
10
11import os
12import datetime
13from agno.agent import Agent
14from agno.models.anthropic import Claude
15
16
17# ---------------------------------------------------------------------------
18# Mock tools — defined once, never changed
19# ---------------------------------------------------------------------------
20
def search_docs(query: str) -> str:
    """Search internal documentation."""
    # NOTE(review): docstring left verbatim — it is presumably surfaced to the
    # model as the tool description; confirm before rewording.
    # Mock tool: always reports the same three canned hits for any query.
    template = "[search_docs] Found 3 results for '{}': doc_a.md, doc_b.md, doc_c.md"
    return template.format(query)
24
25
def read_file(path: str) -> str:
    """Read a file from the project."""
    # NOTE(review): docstring left verbatim — it is presumably surfaced to the
    # model as the tool description; confirm before rewording.
    # Mock tool: nothing is read from disk; the path is echoed into a placeholder.
    template = "[read_file] Contents of {0}: <mock file content for {0}>"
    return template.format(path)
29
30
def write_file(path: str, content: str) -> str:
    """Write content to a file."""
    # NOTE(review): docstring left verbatim — it is presumably surfaced to the
    # model as the tool description; confirm before rewording.
    # Mock tool: nothing is written to disk. The message says "bytes", so
    # report the UTF-8 encoded size rather than the character count, which
    # differs for any non-ASCII content. "surrogatepass" keeps the mock from
    # raising on strings that contain lone surrogates.
    size = len(content.encode("utf-8", errors="surrogatepass"))
    return f"[write_file] Wrote {size} bytes to {path}"
34
35
def run_tests(suite: str = "all") -> str:
    """Run the test suite."""
    # NOTE(review): docstring left verbatim — it is presumably surfaced to the
    # model as the tool description; confirm before rewording.
    # Mock tool: no tests execute; the result is a fixed all-green summary.
    template = "[run_tests] Suite '{}': 42 passed, 0 failed"
    return template.format(suite)
39
40
def enter_plan_mode(goal: str) -> str:
    """Enter structured planning mode for a goal."""
    # NOTE(review): docstring left verbatim — it is presumably surfaced to the
    # model as the tool description; confirm before rewording.
    # Mock tool: no state changes; it only acknowledges the requested goal.
    prefix = "[enter_plan_mode] Planning mode activated for: "
    return prefix + "{}".format(goal)
44
45
def exit_plan_mode(summary: str) -> str:
    """Exit planning mode with a summary."""
    # NOTE(review): docstring left verbatim — it is presumably surfaced to the
    # model as the tool description; confirm before rewording.
    # Mock tool: no state changes; it only echoes the summary back.
    prefix = "[exit_plan_mode] Plan committed: "
    return prefix + "{}".format(summary)
49
50
# Single source of truth for the tool list (Rule 2): every agent built in this
# file receives exactly this list, in exactly this order.
ALL_TOOLS = [
    search_docs,
    read_file,
    write_file,
    run_tests,
    enter_plan_mode,
    exit_plan_mode,
]
52
# Static system prompt shared by every agent in this demo. It contains no
# per-run values (Rule 1); dynamic context goes into the user message instead.
SYSTEM_INSTRUCTIONS = "\n".join(
    [
        "You are an expert software engineering assistant.",
        "You help developers understand codebases, write code, run tests, and plan work.",
        "",
        "You have access to tools for searching documentation, reading and writing files,",
        "running tests, and entering structured planning mode.",
        "",
        "Always reason step by step. Prefer reading before writing. Run tests after changes.",
    ]
)
60
61
62# ---------------------------------------------------------------------------
63# Agent factory — Rule 1 and Rule 2
64# ---------------------------------------------------------------------------
65
def create_agent() -> Agent:
    """
    Build the demo agent with a cache-stable system prompt and tool list.

    Rule 1: add_datetime_to_instructions=False keeps the system prompt
            identical from turn to turn so it can be cached, and
            cache_system_prompt=True asks for the system prompt to be cached.
            The caching flag is an option of the Claude model wrapper, so it
            is set on Claude(...) rather than on Agent(...).
    Rule 2: All tools passed once at construction. Never mutate agent.tools
            mid-session — that breaks the cache prefix.

    Returns:
        A new Agent using SYSTEM_INSTRUCTIONS and ALL_TOOLS.
    """
    model = Claude(
        id="claude-sonnet-4-20250514",
        cache_system_prompt=True,  # Rule 1
    )
    return Agent(
        model=model,
        description="Expert software engineering assistant",
        instructions=SYSTEM_INSTRUCTIONS,
        tools=ALL_TOOLS,
        add_datetime_to_instructions=False,  # Rule 1
        markdown=False,
    )
85
86
87# ---------------------------------------------------------------------------
88# Rule 3 — inject dynamic context into the user message
89# ---------------------------------------------------------------------------
90
def build_message(user_input: str) -> str:
    """
    Prefix a user message with a <system-reminder> block of runtime context.

    Putting per-turn values in the user message (Rule 3) keeps the cacheable
    system prompt unchanged while still giving the model fresh context.

    Args:
        user_input: The raw text the user typed for this turn.

    Returns:
        The reminder block, a blank line, then user_input unchanged.
    """
    # datetime.utcnow() is deprecated; take a timezone-aware UTC time and
    # format it to the same "YYYY-MM-DDTHH:MM:SSZ" shape as before.
    now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # cwd and git_branch are fixed demo placeholders, not read from the system.
    reminder = (
        "<system-reminder>\n"
        f"current_time={now}\n"
        "cwd=/home/user/project\n"
        "git_branch=main\n"
        "</system-reminder>\n\n"
    )
    return reminder + user_input
106
107
108# ---------------------------------------------------------------------------
109# Rule 4 — compaction preserves the cache prefix
110# ---------------------------------------------------------------------------
111
def compact_history(agent: Agent) -> Agent:
    """
    Return a fresh agent that starts with an empty conversation history.

    Rather than appending to a growing history, a new Agent is built with the
    same description, instructions, and tools so the system-prompt cache
    prefix is preserved (Rule 4). Only `description` is read from the agent
    passed in; the model id, instructions, and tools are the same literal and
    module constants that create_agent() uses.

    Args:
        agent: The agent whose description is carried over.

    Returns:
        A new Agent; the original agent is not modified.
    """
    # cache_system_prompt is an option of the Claude model wrapper, so it is
    # set on Claude(...) rather than on Agent(...).
    model = Claude(
        id="claude-sonnet-4-20250514",
        cache_system_prompt=True,
    )
    return Agent(
        model=model,
        description=agent.description,
        instructions=SYSTEM_INSTRUCTIONS,
        tools=ALL_TOOLS,
        add_datetime_to_instructions=False,
        markdown=False,
    )
127
128
129# ---------------------------------------------------------------------------
130# Metrics printer
131# ---------------------------------------------------------------------------
132
def _metric_total(metrics, key: str):
    """Return one token counter from a metrics dict or object as a single number.

    A missing or None counter counts as 0. A list/tuple value is summed, with
    None entries treated as 0.
    """
    if isinstance(metrics, dict):
        raw = metrics.get(key)
    else:
        raw = getattr(metrics, key, None)
    if raw is None:
        return 0
    if isinstance(raw, (list, tuple)):
        return sum(entry or 0 for entry in raw)
    return raw


def print_metrics(turn: int, response) -> None:
    """
    Print cache usage, hit rate, and a rough input cost for one turn.

    Args:
        turn: 1-based turn number, used in the output and for the warning.
        response: Object whose optional `metrics` attribute is either a dict
            or an object exposing the token counters read below.

    A warning is printed when the hit rate is below 80% on any turn after
    the first.
    """
    metrics = getattr(response, "metrics", None) or {}

    # The original comment says Agno stores metrics as lists (one entry per
    # LLM call); _metric_total also accepts plain numbers and None.
    # NOTE(review): confirm these key names match what the response exposes.
    cache_read = _metric_total(metrics, "cache_read_input_tokens")
    cache_write = _metric_total(metrics, "cache_creation_input_tokens")
    input_tokens = _metric_total(metrics, "input_tokens")

    total = cache_read + cache_write + input_tokens
    hit_rate = cache_read / total if total > 0 else 0.0

    # Rough cost: cache_write=$3.75/Mtok, cache_read=$0.30/Mtok, uncached=$3/Mtok
    cost = (cache_write * 3.75 + cache_read * 0.30 + input_tokens * 3.0) / 1_000_000

    print(f"\n  [Turn {turn} metrics]")
    print(f"    cache_write : {cache_write:>6} tokens")
    print(f"    cache_read  : {cache_read:>6} tokens")
    print(f"    uncached    : {input_tokens:>6} tokens")
    print(f"    hit_rate    : {hit_rate:.1%}")
    print(f"    est. cost   : ${cost:.4f}")

    if turn > 1 and hit_rate < 0.8:
        print(f"  !! WARNING: hit_rate {hit_rate:.1%} below 80% on turn {turn}. "
              "Check for cache-busting (timestamps, tool mutations, model switches).")
166
167
168# ---------------------------------------------------------------------------
169# Main
170# ---------------------------------------------------------------------------
171
# Scripted user turns, sent in order to the same agent by main().
PROMPTS = [
    # Turn 1
    "Search the docs for information about our authentication system.",
    # Turn 2
    "Read the file src/auth/middleware.py and explain what it does.",
    # Turn 3
    "The middleware looks fine. Run the test suite to confirm everything passes.",
    # Turn 4
    "Enter plan mode: refactor the auth middleware to support OAuth2.",
]
178
179
def _response_text(response) -> str:
    """Flatten a response's `content` (string or list of blocks) into one string."""
    content = getattr(response, "content", "") or ""
    if isinstance(content, list):
        content = " ".join(getattr(block, "text", str(block)) for block in content)
    return str(content)


def _preview(text: str, limit: int) -> str:
    """Return text cut to `limit` characters, adding '...' only when it was cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


def main() -> None:
    """
    Run the scripted conversation, then the compaction demo, printing metrics.

    Each prompt in PROMPTS is wrapped by build_message() (Rule 3) and sent to
    one agent. Afterwards a compacted agent (Rule 4) answers one follow-up,
    numbered as the turn after the last scripted prompt.
    """
    banner = "=" * 60
    print(banner)
    print("Agno Prompt Caching Demo")
    print("Model: claude-sonnet-4-20250514")
    print(banner)

    agent = create_agent()

    for turn, prompt in enumerate(PROMPTS, start=1):
        print(f"\n--- Turn {turn} ---")
        message = build_message(prompt)  # Rule 3
        print(f"  User: {_preview(prompt, 80)}")

        response = agent.run(message)

        print(f"  Agent: {_preview(_response_text(response), 120)}")
        print_metrics(turn, response)

    # Rule 4: compaction demo
    print("\n" + banner)
    print("Compaction demo (Rule 4)")
    print(banner)
    compacted = compact_history(agent)
    followup = build_message("What is the status of the OAuth2 refactor plan?")
    # Derived from PROMPTS so the numbering stays right if prompts change.
    followup_turn = len(PROMPTS) + 1
    print(f"\n--- Turn {followup_turn} (post-compaction) ---")
    response = compacted.run(followup)
    print(f"  Agent: {_preview(_response_text(response), 120)}")
    print_metrics(followup_turn, response)
    print("\nNote: cache_write may spike again after compaction (new session).")
    print("      But system prompt + tools prefix is preserved, so subsequent")
    print("      turns in the compacted agent will hit cache again.")
220
# Script entry point: fail fast with a clear message when the API key is unset
# or empty, before any agent is constructed.
if __name__ == "__main__":
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if api_key:
        main()
    else:
        raise SystemExit("Set ANTHROPIC_API_KEY before running this demo.")
225
Ask anything. Turn 1 shows 0% cache hits.
Ask a follow-up and watch the hit rate climb.
Press Enter to send