From 318166f8b0f58c0d2ced5ef200156a1b6c96d7d4 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Wed, 3 Dec 2025 09:40:05 +0000
Subject: [PATCH] Verbose: send tool result metadata only

---
 CHANGELOG.md                    |  2 +-
 README.md                       |  2 +-
 docs/thinking.md                |  2 +-
 src/agents/agents.test.ts       |  9 ++++
 src/agents/pi.ts                | 35 +++++++++++++--
 src/agents/types.ts             |  7 ++-
 src/auto-reply/command-reply.ts | 77 ++++++++++++++++++++++++---------
 src/index.core.test.ts          |  5 +--
 8 files changed, 108 insertions(+), 31 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b47b0b2f7..dad0d1935 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,7 @@
 
 ### Highlights
 - **Thinking directives & state:** `/t|/think|/thinking <level>` (aliases off|minimal|low|medium|high|max/highest). Inline applies to that message; directive-only message pins the level for the session; `/think:off` clears. Resolution: inline > session override > `inbound.reply.thinkingDefault` > off. Pi/Tau get `--thinking <level>` (except off); other agents append cue words (`think` → `think hard` → `think harder` → `ultrathink`). Heartbeat probe uses `HEARTBEAT /think:high`.
-- **Verbose directives + session hints:** `/v|/verbose on|full|off` mirrors thinking: inline > session > config default. Directive-only replies with an acknowledgement; invalid levels return a hint. When enabled, tool results from JSON-emitting agents (Pi/Tau, etc.) are forwarded as `[🛠️ <tool-name>] …` messages (now streamed as they happen), and new sessions surface a `🧭 New session: <id>` hint.
+- **Verbose directives + session hints:** `/v|/verbose on|full|off` mirrors thinking: inline > session > config default. Directive-only replies with an acknowledgement; invalid levels return a hint. When enabled, tool results from JSON-emitting agents (Pi/Tau, etc.) are forwarded as metadata-only `[🛠️ <tool-name>]` messages (now streamed as they happen), and new sessions surface a `🧭 New session: <id>` hint.
 - **Directive confirmations:** Directive-only messages now reply with an acknowledgement (`Thinking level set to high.` / `Thinking disabled.`) and reject unknown levels with a helpful hint (state is unchanged).
 - **Pi/Tau stability:** RPC replies buffered until the assistant turn finishes; parsers return consistent `texts[]`; web auto-replies keep a warm Tau RPC process to avoid cold starts.
 - **Claude prompt flow:** One-time `sessionIntro` with per-message `/think:high` bodyPrefix; system prompt always sent on first turn even with `sendSystemOnce`.
diff --git a/README.md b/README.md
index 5470a0aa5..b5f3d86dc 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ warelay supports running on the same phone number you message from—you chat wi
 - Levels: `on|full` (same) or `off` (default). Use `/v on`, `/verbose:full`, `/v off`, etc.; colon optional.
 - Directive-only message sets a session-level verbose flag (`Verbose logging enabled./disabled.`); invalid levels reply with a hint and don’t change state.
 - Inline directive applies only to that message; resolution: inline > session default > `inbound.reply.verboseDefault` (config) > off.
-- When verbose is on **and the agent emits structured tool results (Pi/Tau and other JSON-emitting agents)**, tool results are sent back as separate messages prefixed with `🛠️`.
+- When verbose is on **and the agent emits structured tool results (Pi/Tau and other JSON-emitting agents)**, only tool metadata is forwarded: each tool result becomes its own `[🛠️ <tool-name>]` message (`[🛠️ tool]` when no name can be inferred); the tool output/body is not inlined.
 - Starting a new session while verbose is on adds a first reply like `🧭 New session: <id>` so you can correlate runs.
 
 ### Logging (optional)
diff --git a/docs/thinking.md b/docs/thinking.md
index 311608a9f..fbe0d06aa 100644
--- a/docs/thinking.md
+++ b/docs/thinking.md
@@ -28,7 +28,7 @@
 - Levels: `on|full` or `off` (default).
 - Directive-only message toggles session verbose and replies `Verbose logging enabled.` / `Verbose logging disabled.`; invalid levels return a hint without changing state.
 - Inline directive affects only that message; session/global defaults apply otherwise.
-- When verbose is on, agents that emit structured tool results (Pi/Tau, other JSON agents) send each tool result back as its own message, prefixed with `🛠️`.
+- When verbose is on, agents that emit structured tool results (Pi/Tau, other JSON agents) send each tool result back as its own metadata-only message of the form `[🛠️ <tool-name>]` (the tool output itself is not forwarded).
 
 ## Heartbeats
 - Heartbeat probe body is `HEARTBEAT /think:high`, so it always asks for max thinking on the probe. Inline directive wins; session/global defaults are used only when no directive is present.
diff --git a/src/agents/agents.test.ts b/src/agents/agents.test.ts
index da40b2109..9a6be1e74 100644
--- a/src/agents/agents.test.ts
+++ b/src/agents/agents.test.ts
@@ -67,6 +67,15 @@ describe("agent buildArgs + parseOutput helpers", () => {
     expect((parsed.meta?.usage as { output?: number })?.output).toBe(5);
   });
 
+  it("piSpec carries tool names when present", () => {
+    const stdout =
+      '{"type":"message_end","message":{"role":"tool_result","name":"bash","content":[{"type":"text","text":"ls output"}]}}';
+    const parsed = piSpec.parseOutput(stdout);
+    const tool = parsed.toolResults?.[0] as { text?: string; toolName?: string };
+    expect(tool?.text).toBe("ls output");
+    expect(tool?.toolName).toBe("bash");
+  });
+
   it("codexSpec parses agent_message and aggregates usage", () => {
     const stdout = [
       '{"type":"item.completed","item":{"type":"agent_message","text":"hi there"}}',
diff --git a/src/agents/pi.ts b/src/agents/pi.ts
index e8569c85a..2687b1c2d 100644
--- a/src/agents/pi.ts
+++ b/src/agents/pi.ts
@@ -1,6 +1,11 @@
 import path from "node:path";
 
-import type { AgentMeta, AgentParseResult, AgentSpec } from "./types.js";
+import type {
+  AgentMeta,
+  AgentParseResult,
+  AgentSpec,
+  AgentToolResult,
+} from "./types.js";
 
 type PiAssistantMessage = {
   role?: string;
@@ -9,15 +14,37 @@ type PiAssistantMessage = {
   model?: string;
   provider?: string;
   stopReason?: string;
+  name?: string;
+  toolName?: string;
+  tool_call_id?: string;
   toolCallId?: string;
 };
 
+function inferToolName(msg: PiAssistantMessage): string | undefined {
+  const candidates = [
+    msg.toolName,
+    msg.name,
+    msg.toolCallId,
+    msg.tool_call_id,
+  ]
+    .map((c) => (typeof c === "string" ? c.trim() : ""))
+    .filter(Boolean);
+  if (candidates.length) return candidates[0];
+
+  if (msg.role && msg.role.includes(":")) {
+    const suffix = msg.role.split(":").slice(1).join(":").trim();
+    if (suffix) return suffix;
+  }
+
+  return undefined;
+}
+
 function parsePiJson(raw: string): AgentParseResult {
   const lines = raw.split(/\n+/).filter((l) => l.trim().startsWith("{"));
 
   // Collect only completed assistant messages (skip streaming updates/toolcalls).
   const texts: string[] = [];
-  const toolResults: string[] = [];
+  const toolResults: AgentToolResult[] = [];
   let lastAssistant: PiAssistantMessage | undefined;
   let lastPushed: string | undefined;
 
@@ -59,7 +86,9 @@ function parsePiJson(raw: string): AgentParseResult {
           .map((c) => c.text)
           .join("\n")
           .trim();
-        if (toolText) toolResults.push(toolText);
+        if (toolText) {
+          toolResults.push({ text: toolText, toolName: inferToolName(msg) });
+        }
       }
     } catch {
       // ignore malformed lines
diff --git a/src/agents/types.ts b/src/agents/types.ts
index 7d16b2b47..f826188d6 100644
--- a/src/agents/types.ts
+++ b/src/agents/types.ts
@@ -15,11 +15,16 @@ export type AgentMeta = {
   extra?: Record<string, unknown>;
 };
 
+export type AgentToolResult = {
+  text: string;
+  toolName?: string;
+};
+
 export type AgentParseResult = {
   // Plural to support agents that emit multiple assistant turns per prompt.
   texts?: string[];
   mediaUrls?: string[];
-  toolResults?: string[];
+  toolResults?: Array<string | AgentToolResult>;
   meta?: AgentMeta;
 };
 
diff --git a/src/auto-reply/command-reply.ts b/src/auto-reply/command-reply.ts
index e1397219e..b40315599 100644
--- a/src/auto-reply/command-reply.ts
+++ b/src/auto-reply/command-reply.ts
@@ -2,7 +2,7 @@ import fs from "node:fs/promises";
 import path from "node:path";
 
 import { type AgentKind, getAgentSpec } from "../agents/index.js";
-import type { AgentMeta } from "../agents/types.js";
+import type { AgentMeta, AgentToolResult } from "../agents/types.js";
 import type { WarelayConfig } from "../config/config.js";
 import { isVerbose, logVerbose } from "../globals.js";
 import { logError } from "../logger.js";
@@ -53,6 +53,51 @@ export type CommandReplyResult = {
   meta: CommandReplyMeta;
 };
 
+type ToolMessageLike = {
+  name?: string;
+  toolName?: string;
+  tool_call_id?: string;
+  toolCallId?: string;
+  role?: string;
+};
+
+function inferToolName(message?: ToolMessageLike): string | undefined {
+  if (!message) return undefined;
+  const candidates = [
+    message.toolName,
+    message.name,
+    message.toolCallId,
+    message.tool_call_id,
+  ]
+    .map((c) => (typeof c === "string" ? c.trim() : ""))
+    .filter(Boolean);
+  if (candidates.length) return candidates[0];
+
+  if (message.role && message.role.includes(":")) {
+    const suffix = message.role.split(":").slice(1).join(":").trim();
+    if (suffix) return suffix;
+  }
+  return undefined;
+}
+
+function normalizeToolResults(
+  toolResults?: Array<string | AgentToolResult>,
+): AgentToolResult[] {
+  if (!toolResults) return [];
+  return toolResults
+    .map((tr) => (typeof tr === "string" ? { text: tr } : tr))
+    .map((tr) => ({
+      text: (tr.text ?? "").trim(),
+      toolName: tr.toolName?.trim() || undefined,
+    }))
+    .filter((tr) => tr.text.length > 0);
+}
+
+function formatToolPrefix(toolName?: string) {
+  const label = toolName?.trim() || "tool";
+  return `[🛠️ ${label}]`;
+}
+
 export function summarizeClaudeMetadata(payload: unknown): string | undefined {
   if (!payload || typeof payload !== "object") return undefined;
   const obj = payload as Record<string, unknown>;
@@ -289,23 +334,14 @@ export async function runCommandReply(
                       ev.message?.role === "tool_result" &&
                       Array.isArray(ev.message.content)
                     ) {
-                      const text = (
-                        ev.message.content as Array<{ text?: string }>
-                      )
-                        .map((c) => c.text)
-                        .filter((t): t is string => !!t)
-                        .join("\n")
-                        .trim();
-                      if (text) {
-                        const { text: cleanedText, mediaUrls: mediaFound } =
-                          splitMediaFromOutput(`🛠️ ${text}`);
-                        void onPartialReply({
-                          text: cleanedText,
-                          mediaUrls: mediaFound?.length
-                            ? mediaFound
-                            : undefined,
-                        } as ReplyPayload);
-                      }
+                      const toolName = inferToolName(ev.message);
+                      const prefix = formatToolPrefix(toolName);
+                      const { text: cleanedText, mediaUrls: mediaFound } =
+                        splitMediaFromOutput(prefix);
+                      void onPartialReply({
+                        text: cleanedText,
+                        mediaUrls: mediaFound?.length ? mediaFound : undefined,
+                      } as ReplyPayload);
                     }
                   } catch {
                     // ignore malformed lines
@@ -341,8 +377,7 @@ export async function runCommandReply(
     // Collect assistant texts and tool results from parseOutput (tau RPC can emit many).
     const parsedTexts =
       parsed?.texts?.map((t) => t.trim()).filter(Boolean) ?? [];
-    const parsedToolResults =
-      parsed?.toolResults?.map((t) => t.trim()).filter(Boolean) ?? [];
+    const parsedToolResults = normalizeToolResults(parsed?.toolResults);
 
     type ReplyItem = { text: string; media?: string[] };
     const replyItems: ReplyItem[] = [];
@@ -352,7 +387,7 @@ export async function runCommandReply(
 
     if (includeToolResultsInline) {
       for (const tr of parsedToolResults) {
-        const prefixed = `🛠️ ${tr}`;
+        const prefixed = formatToolPrefix(tr.toolName);
         const { text: cleanedText, mediaUrls: mediaFound } =
           splitMediaFromOutput(prefixed);
         replyItems.push({
diff --git a/src/index.core.test.ts b/src/index.core.test.ts
index c2bf466ce..78743c4a7 100644
--- a/src/index.core.test.ts
+++ b/src/index.core.test.ts
@@ -719,7 +719,7 @@ describe("config and templating", () => {
     const rpcSpy = vi.spyOn(tauRpc, "runPiRpc").mockResolvedValue({
       stdout:
         '{"type":"message","message":{"role":"assistant","content":[{"type":"text","text":"summary"}]}}\n' +
-        '{"type":"message_end","message":{"role":"tool_result","content":[{"type":"text","text":"ls output"}]}}',
+        '{"type":"message_end","message":{"role":"tool_result","name":"bash","content":[{"type":"text","text":"ls output"}]}}',
       stderr: "",
       code: 0,
       signal: null,
@@ -744,8 +744,7 @@ describe("config and templating", () => {
     expect(rpcSpy).toHaveBeenCalled();
     const payloads = Array.isArray(res) ? res : res ? [res] : [];
     expect(payloads.length).toBeGreaterThanOrEqual(2);
-    expect(payloads[0]?.text).toContain("🛠️");
-    expect(payloads[0]?.text).toContain("ls output");
+    expect(payloads[0]?.text).toBe("[🛠️ bash]");
     expect(payloads[1]?.text).toContain("summary");
   });