Prompt Caching Demo

Chat with Claude — cache metrics update after each response

Implementation:
demos/agno-prompt-caching/main.py (225 lines)
1"""
2Agno prompt caching demo.
3
4Demonstrates four caching rules for Agno agents backed by claude-sonnet-4-20250514:
5 Rule 1 — add_datetime_to_instructions=False, cache_system_prompt=True
6 Rule 2 — all tools passed once at Agent() construction
7 Rule 3 — dynamic context injected via <system-reminder> in user message
8 Rule 4 — compaction clones agent with same description/instructions/tools
9"""
10
11import os
12import datetime
13from agno.agent import Agent
14from agno.models.anthropic import Claude
15
16
17# ---------------------------------------------------------------------------
18# Mock tools — defined once, never changed
19# ---------------------------------------------------------------------------
20
def search_docs(query: str) -> str:
    """Search internal documentation."""
    # Mock tool: nothing is searched; a fixed three-document hit list is
    # reported for whatever query is given.
    # NOTE(review): the docstring is kept verbatim because it presumably
    # doubles as the tool description sent to the model — confirm.
    canned_hits = ", ".join(("doc_a.md", "doc_b.md", "doc_c.md"))
    return "[search_docs] Found 3 results for '{}': {}".format(query, canned_hits)
24
25
def read_file(path: str) -> str:
    """Read a file from the project."""
    # Mock tool: no file is opened; the reply just echoes the path twice.
    # NOTE(review): the docstring is kept verbatim because it presumably
    # doubles as the tool description sent to the model — confirm.
    template = "[read_file] Contents of {0}: <mock file content for {0}>"
    return template.format(path)
29
30
def write_file(path: str, content: str) -> str:
    """Write content to a file."""
    # Mock tool: nothing is written; only the length of ``content`` and the
    # target path are reported back.
    # NOTE(review): the docstring is kept verbatim because it presumably
    # doubles as the tool description sent to the model — confirm.
    size = len(content)
    return "[write_file] Wrote {} bytes to {}".format(size, path)
34
35
def run_tests(suite: str = "all") -> str:
    """Run the test suite."""
    # Mock tool: no tests are executed; a fixed "42 passed, 0 failed" result
    # is reported for the named suite.
    # NOTE(review): the docstring is kept verbatim because it presumably
    # doubles as the tool description sent to the model — confirm.
    outcome = "42 passed, 0 failed"
    return "[run_tests] Suite '{}': {}".format(suite, outcome)
39
40
def enter_plan_mode(goal: str) -> str:
    """Enter structured planning mode for a goal."""
    # Mock tool: no state changes; the goal is simply echoed in the reply.
    # NOTE(review): the docstring is kept verbatim because it presumably
    # doubles as the tool description sent to the model — confirm.
    prefix = "[enter_plan_mode] Planning mode activated for: "
    return "{}{}".format(prefix, goal)
44
45
def exit_plan_mode(summary: str) -> str:
    """Exit planning mode with a summary."""
    # Mock tool: no state changes; the summary is simply echoed in the reply.
    # NOTE(review): the docstring is kept verbatim because it presumably
    # doubles as the tool description sent to the model — confirm.
    prefix = "[exit_plan_mode] Plan committed: "
    return "{}{}".format(prefix, summary)
49
50
# Complete tool set handed to every Agent in this file. The order is fixed
# here once so that each agent is constructed with the same tool list.
ALL_TOOLS = [
    search_docs,
    read_file,
    write_file,
    run_tests,
    enter_plan_mode,
    exit_plan_mode,
]
52
53SYSTEM_INSTRUCTIONS = """You are an expert software engineering assistant.
54You help developers understand codebases, write code, run tests, and plan work.
55
56You have access to tools for searching documentation, reading and writing files,
57running tests, and entering structured planning mode.
58
59Always reason step by step. Prefer reading before writing. Run tests after changes."""
60
61
62# ---------------------------------------------------------------------------
63# Agent factory — Rule 1 and Rule 2
64# ---------------------------------------------------------------------------
65
def create_agent() -> Agent:
    """
    Build the demo agent with a stable system prompt and a fixed tool list.

    Rule 1: add_datetime_to_instructions=False keeps the system prompt stable
            so Anthropic can cache it. cache_system_prompt=True requests
            cache_control breakpoints on the system prompt.
    Rule 2: All tools passed once at construction. Never mutate agent.tools
            mid-session — that breaks the cache prefix.

    Returns:
        A new Agent backed by claude-sonnet-4-20250514, using
        SYSTEM_INSTRUCTIONS and ALL_TOOLS.
    """
    return Agent(
        model=Claude(
            id="claude-sonnet-4-20250514",
            # Rule 1. NOTE(review): Agno's Anthropic prompt-caching docs set
            # this flag on the Claude model rather than on Agent — confirm
            # against the installed Agno version.
            cache_system_prompt=True,
        ),
        description="Expert software engineering assistant",
        instructions=SYSTEM_INSTRUCTIONS,
        tools=ALL_TOOLS,
        add_datetime_to_instructions=False,  # Rule 1
        markdown=False,
    )
85
86
87# ---------------------------------------------------------------------------
88# Rule 3 — inject dynamic context into the user message
89# ---------------------------------------------------------------------------
90
def build_message(user_input: str) -> str:
    """
    Prefix the user's text with a <system-reminder> block of runtime context.

    The block carries the current UTC time plus a fixed cwd and git branch.
    Putting this per-turn data in the user message, rather than the system
    prompt, keeps the cacheable system prompt unchanged between turns.

    Args:
        user_input: The raw text the user typed for this turn.

    Returns:
        The reminder block, a blank line, then ``user_input`` unchanged.
    """
    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
    # timestamp and format it to the same "YYYY-MM-DDTHH:MM:SSZ" shape.
    now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    reminder = (
        "<system-reminder>\n"
        f"current_time={now}\n"
        "cwd=/home/user/project\n"
        "git_branch=main\n"
        "</system-reminder>\n\n"
    )
    return reminder + user_input
106
107
108# ---------------------------------------------------------------------------
109# Rule 4 — compaction preserves the cache prefix
110# ---------------------------------------------------------------------------
111
def compact_history(agent: Agent) -> Agent:
    """
    Return a fresh Agent configured like the demo agent, without its history.

    Rather than appending a growing history, a new agent is built. The
    description is copied from ``agent``; the instructions and tools come from
    the module-level SYSTEM_INSTRUCTIONS and ALL_TOOLS (the same values
    create_agent uses), so the system-prompt cache prefix is preserved. No
    conversation history or summary is passed to the new agent here.

    Args:
        agent: The agent being compacted; only its ``description`` is read.

    Returns:
        A new Agent with the same model id, description, instructions, tools
        and caching settings.
    """
    return Agent(
        model=Claude(
            id="claude-sonnet-4-20250514",
            # NOTE(review): Agno's Anthropic prompt-caching docs set this flag
            # on the Claude model rather than on Agent — confirm against the
            # installed Agno version.
            cache_system_prompt=True,
        ),
        description=agent.description,
        instructions=SYSTEM_INSTRUCTIONS,
        tools=ALL_TOOLS,
        add_datetime_to_instructions=False,
        markdown=False,
    )
127
128
129# ---------------------------------------------------------------------------
130# Metrics printer
131# ---------------------------------------------------------------------------
132
def _metric_total(metrics, *names: str):
    """Return the summed value of the first of ``names`` that ``metrics`` provides, else 0."""
    for name in names:
        if isinstance(metrics, dict):
            value = metrics.get(name)
        else:
            value = getattr(metrics, name, None)
        if value is None:
            # Missing or explicitly null: try the next candidate name.
            continue
        if isinstance(value, (list, tuple)):
            # One entry per LLM call; treat null entries as zero.
            return sum(entry or 0 for entry in value)
        return value
    return 0


def print_metrics(turn: int, response) -> None:
    """
    Print cache-read, cache-write and uncached token counts for one turn.

    Also prints the hit rate (cache_read / all three buckets combined), a
    rough cost estimate, and a warning when the hit rate is below 80% on any
    turn after the first.

    Args:
        turn: 1-based turn number, used in the header and the warning.
        response: Object whose optional ``metrics`` attribute is either a dict
            or an object exposing the token counters; each counter may be a
            number or a list of numbers (one entry per LLM call).
    """
    metrics = getattr(response, "metrics", None) or {}

    # Anthropic's raw usage names are tried first.
    # NOTE(review): the fallback names are what Agno's own metrics are
    # believed to use (they vary by version) — verify against the installed
    # release.
    cache_read = _metric_total(
        metrics, "cache_read_input_tokens", "cache_read_tokens", "cached_tokens"
    )
    cache_write = _metric_total(
        metrics, "cache_creation_input_tokens", "cache_write_tokens"
    )
    input_tokens = _metric_total(metrics, "input_tokens")

    total = cache_read + cache_write + input_tokens
    hit_rate = cache_read / total if total > 0 else 0.0

    # Rough cost: cache_write=$3.75/Mtok, cache_read=$0.30/Mtok, uncached=$3/Mtok
    cost = (cache_write * 3.75 + cache_read * 0.30 + input_tokens * 3.0) / 1_000_000

    print(f"\n [Turn {turn} metrics]")
    print(f" cache_write : {cache_write:>6} tokens")
    print(f" cache_read : {cache_read:>6} tokens")
    print(f" uncached : {input_tokens:>6} tokens")
    print(f" hit_rate : {hit_rate:.1%}")
    print(f" est. cost : ${cost:.4f}")

    # Turn 1 is excluded from the warning check.
    if turn > 1 and hit_rate < 0.8:
        print(f" !! WARNING: hit_rate {hit_rate:.1%} below 80% on turn {turn}. "
              "Check for cache-busting (timestamps, tool mutations, model switches).")
166
167
168# ---------------------------------------------------------------------------
169# Main
170# ---------------------------------------------------------------------------
171
# Scripted user turns, sent to the agent in this order by main().
PROMPTS: list[str] = [
    "Search the docs for information about our authentication system.",
    "Read the file src/auth/middleware.py and explain what it does.",
    "The middleware looks fine. Run the test suite to confirm everything passes.",
    "Enter plan mode: refactor the auth middleware to support OAuth2.",
]
178
179
def _preview(text: str, limit: int) -> str:
    """Return ``text`` cut to ``limit`` characters, adding "..." only when it was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


def _response_text(response) -> str:
    """Flatten a response's ``content`` (a string or a list of blocks) into one string."""
    content = getattr(response, "content", "") or ""
    if isinstance(content, list):
        # Prefer each block's ``text`` attribute; fall back to its str() form.
        content = " ".join(getattr(block, "text", str(block)) for block in content)
    return str(content)


def main() -> None:
    """
    Run the scripted demo: one turn per entry in PROMPTS, then a compaction turn.

    Each prompt is wrapped by build_message (Rule 3), sent through agent.run,
    and followed by a print_metrics report. Afterwards a fresh agent is
    obtained from compact_history (Rule 4) and asked a single follow-up.
    """
    print("=" * 60)
    print("Agno Prompt Caching Demo")
    print("Model: claude-sonnet-4-20250514")
    print("=" * 60)

    agent = create_agent()

    for turn, prompt in enumerate(PROMPTS, start=1):
        print(f"\n--- Turn {turn} ---")
        message = build_message(prompt)  # Rule 3
        # Show the bare prompt, not the message with its reminder block.
        print(f" User: {_preview(prompt, 80)}")

        response = agent.run(message)

        print(f" Agent: {_preview(_response_text(response), 120)}")
        print_metrics(turn, response)

    # Rule 4: compaction demo
    print("\n" + "=" * 60)
    print("Compaction demo (Rule 4)")
    print("=" * 60)
    compacted = compact_history(agent)
    followup = build_message("What is the status of the OAuth2 refactor plan?")
    # Numbered from PROMPTS so the label stays right if the script changes.
    post_turn = len(PROMPTS) + 1
    print(f"\n--- Turn {post_turn} (post-compaction) ---")
    response = compacted.run(followup)
    print(f" Agent: {_preview(_response_text(response), 120)}")
    print_metrics(post_turn, response)
    print("\nNote: cache_write may spike again after compaction (new session).")
    print(" But system prompt + tools prefix is preserved, so subsequent")
    print(" turns in the compacted agent will hit cache again.")
219
220
if __name__ == "__main__":
    # Fail fast with a readable message when the API key is unset or empty,
    # before any agent is built.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if api_key == "":
        raise SystemExit("Set ANTHROPIC_API_KEY before running this demo.")
    main()
225

Ask anything. Turn 1 shows 0% cache hits.
Ask a follow-up and watch the hit rate climb.

Press Enter to send