From 42948b70e31157388a374d3ae5737f2259ef8bd7 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Wed, 17 Dec 2025 15:29:00 +0000
Subject: [PATCH] fix(pi): harden image read mime

---

Reviewer notes (this area sits after the "---" separator, so it is not part
of the commit message):

* This copy was re-flowed from an extraction that had collapsed every newline
  and all indentation, and had apparently dropped angle-bracket spans.
* Restored: "Record<string, unknown>" wherever a bare "Record" appeared (a
  bare "Record" does not type-check). The re-flowed new files come out at
  exactly 44 / 32 / 127 lines, matching the hunk headers below.
* NOT restored, because the extraction does not show them -- verify against
  commit 42948b7 before relying on this file:
    - the author e-mail address on the "From:" header;
    - any type arguments on "Model" (context line of the third
      pi-embedded.ts hunk) and on "AgentTool" in pi-tools.ts;
    - the exact leading whitespace of the pi-embedded.ts context lines,
      which is a best-effort guess in the last hunk. Whitespace-tolerant
      application (git apply --ignore-whitespace) may be needed.

 src/agents/pi-embedded-utils.ts |  44 +++++++++++
 src/agents/pi-embedded.ts       |  51 ++-----------
 src/agents/pi-tools.test.ts     |  32 ++++++++
 src/agents/pi-tools.ts          | 127 ++++++++++++++++++++++++++++++++
 4 files changed, 209 insertions(+), 45 deletions(-)
 create mode 100644 src/agents/pi-embedded-utils.ts
 create mode 100644 src/agents/pi-tools.test.ts
 create mode 100644 src/agents/pi-tools.ts

diff --git a/src/agents/pi-embedded-utils.ts b/src/agents/pi-embedded-utils.ts
new file mode 100644
index 000000000..a571a9047
--- /dev/null
+++ b/src/agents/pi-embedded-utils.ts
@@ -0,0 +1,44 @@
+import type { AssistantMessage } from "@mariozechner/pi-ai";
+
+export function extractAssistantText(msg: AssistantMessage): string {
+  const isTextBlock = (
+    block: unknown,
+  ): block is { type: "text"; text: string } => {
+    if (!block || typeof block !== "object") return false;
+    const rec = block as Record<string, unknown>;
+    return rec.type === "text" && typeof rec.text === "string";
+  };
+
+  const blocks = Array.isArray(msg.content)
+    ? msg.content
+        .filter(isTextBlock)
+        .map((c) => c.text.trim())
+        .filter(Boolean)
+    : [];
+  return blocks.join("\n").trim();
+}
+
+export function inferToolMetaFromArgs(
+  toolName: string,
+  args: unknown,
+): string | undefined {
+  if (!args || typeof args !== "object") return undefined;
+  const record = args as Record<string, unknown>;
+
+  const p = typeof record.path === "string" ? record.path : undefined;
+  const command =
+    typeof record.command === "string" ? record.command : undefined;
+
+  if (toolName === "read" && p) {
+    const offset =
+      typeof record.offset === "number" ? record.offset : undefined;
+    const limit = typeof record.limit === "number" ? record.limit : undefined;
+    if (offset !== undefined && limit !== undefined) {
+      return `${p}:${offset}-${offset + limit}`;
+    }
+    return p;
+  }
+  if ((toolName === "edit" || toolName === "write") && p) return p;
+  if (toolName === "bash" && command) return command;
+  return p ?? command;
+}
diff --git a/src/agents/pi-embedded.ts b/src/agents/pi-embedded.ts
index 8a7c42ee9..29195bdb3 100644
--- a/src/agents/pi-embedded.ts
+++ b/src/agents/pi-embedded.ts
@@ -19,7 +19,6 @@ import {
 } from "@mariozechner/pi-ai";
 import {
   AgentSession,
-  codingTools,
   messageTransformer,
   SessionManager,
   SettingsManager,
@@ -34,7 +33,12 @@ import { splitMediaFromOutput } from "../media/parse.js";
 import { enqueueCommand } from "../process/command-queue.js";
 import { resolveUserPath } from "../utils.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
+import {
+  extractAssistantText,
+  inferToolMetaFromArgs,
+} from "./pi-embedded-utils.js";
 import { getAnthropicOAuthToken } from "./pi-oauth.js";
+import { createClawdisCodingTools } from "./pi-tools.js";
 import { buildAgentSystemPrompt } from "./system-prompt.js";
 import { loadWorkspaceBootstrapFiles } from "./workspace.js";
 
@@ -86,49 +90,6 @@ function resolveModel(
   return model as Model | undefined;
 }
 
-function extractAssistantText(msg: AssistantMessage): string {
-  const isTextBlock = (
-    block: unknown,
-  ): block is { type: "text"; text: string } => {
-    if (!block || typeof block !== "object") return false;
-    const rec = block as Record<string, unknown>;
-    return rec.type === "text" && typeof rec.text === "string";
-  };
-
-  const blocks = Array.isArray(msg.content)
-    ? msg.content
-        .filter(isTextBlock)
-        .map((c) => c.text.trim())
-        .filter(Boolean)
-    : [];
-  return blocks.join("\n").trim();
-}
-
-function inferToolMetaFromArgs(
-  toolName: string,
-  args: unknown,
-): string | undefined {
-  if (!args || typeof args !== "object") return undefined;
-  const record = args as Record<string, unknown>;
-
-  const p = typeof record.path === "string" ? record.path : undefined;
-  const command =
-    typeof record.command === "string" ? record.command : undefined;
-
-  if (toolName === "read" && p) {
-    const offset =
-      typeof record.offset === "number" ? record.offset : undefined;
-    const limit = typeof record.limit === "number" ? record.limit : undefined;
-    if (offset !== undefined && limit !== undefined) {
-      return `${p}:${offset}-${offset + limit}`;
-    }
-    return p;
-  }
-  if ((toolName === "edit" || toolName === "write") && p) return p;
-  if (toolName === "bash" && command) return command;
-  return p ?? command;
-}
-
 async function ensureSessionHeader(params: {
   sessionFile: string;
   sessionId: string;
@@ -239,7 +200,7 @@ export async function runEmbeddedPiAgent(params: {
         systemPrompt,
         model,
         thinkingLevel,
-        tools: codingTools,
+        tools: createClawdisCodingTools(),
       },
       messageTransformer,
       queueMode: settingsManager.getQueueMode(),
diff --git a/src/agents/pi-tools.test.ts b/src/agents/pi-tools.test.ts
new file mode 100644
index 000000000..c7ad2a4e5
--- /dev/null
+++ b/src/agents/pi-tools.test.ts
@@ -0,0 +1,32 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { describe, expect, it } from "vitest";
+
+import { createClawdisCodingTools } from "./pi-tools.js";
+
+const PNG_1x1 =
+  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
+
+describe("createClawdisCodingTools", () => {
+  it("sniffs mime from bytes when extension lies", async () => {
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-pi-"));
+    const filePath = path.join(tmpDir, "image.jpg"); // actually PNG bytes
+    await fs.writeFile(filePath, Buffer.from(PNG_1x1, "base64"));
+
+    const read = createClawdisCodingTools().find((t) => t.name === "read");
+    expect(read).toBeTruthy();
+    if (!read) throw new Error("read tool missing");
+
+    const res = await read.execute("toolCallId", { path: filePath });
+    const image = res.content.find(
+      (b): b is { type: "image"; mimeType: string } =>
+        !!b &&
+        typeof b === "object" &&
+        (b as Record<string, unknown>).type === "image" &&
+        typeof (b as Record<string, unknown>).mimeType === "string",
+    );
+
+    expect(image?.mimeType).toBe("image/png");
+  });
+});
diff --git a/src/agents/pi-tools.ts b/src/agents/pi-tools.ts
new file mode 100644
index 000000000..ecc0ae054
--- /dev/null
+++ b/src/agents/pi-tools.ts
@@ -0,0 +1,127 @@
+import type { AgentTool } from "@mariozechner/pi-ai";
+import { codingTools, readTool } from "@mariozechner/pi-coding-agent";
+
+import { detectMime } from "../media/mime.js";
+
+type ImageContentBlock = {
+  type: "image";
+  data: string;
+  mimeType: string;
+};
+
+type TextContentBlock = {
+  type: "text";
+  text: string;
+};
+
+type ToolResult = {
+  content: Array<
+    ImageContentBlock | TextContentBlock | Record<string, unknown>
+  >;
+  details?: unknown;
+};
+
+function sniffMimeFromBase64(base64: string): string | undefined {
+  const trimmed = base64.trim();
+  if (!trimmed) return undefined;
+
+  const take = Math.min(256, trimmed.length);
+  const sliceLen = take - (take % 4);
+  if (sliceLen < 8) return undefined;
+
+  try {
+    const head = Buffer.from(trimmed.slice(0, sliceLen), "base64");
+    return detectMime({ buffer: head });
+  } catch {
+    return undefined;
+  }
+}
+
+function rewriteReadImageHeader(text: string, mimeType: string): string {
+  // pi-coding-agent uses: "Read image file [image/png]"
+  if (text.startsWith("Read image file [") && text.endsWith("]")) {
+    return `Read image file [${mimeType}]`;
+  }
+  return text;
+}
+
+function normalizeReadImageResult(
+  result: ToolResult,
+  filePath: string,
+): ToolResult {
+  const content = Array.isArray(result.content) ? result.content : [];
+
+  const image = content.find(
+    (b): b is ImageContentBlock =>
+      !!b &&
+      typeof b === "object" &&
+      (b as ImageContentBlock).type === "image" &&
+      typeof (b as ImageContentBlock).data === "string" &&
+      typeof (b as ImageContentBlock).mimeType === "string",
+  );
+  if (!image) return result;
+
+  if (!image.data.trim()) {
+    throw new Error(`read: image payload is empty (${filePath})`);
+  }
+
+  const sniffed = sniffMimeFromBase64(image.data);
+  if (!sniffed) return result;
+
+  if (!sniffed.startsWith("image/")) {
+    throw new Error(
+      `read: file looks like ${sniffed} but was treated as ${image.mimeType} (${filePath})`,
+    );
+  }
+
+  if (sniffed === image.mimeType) return result;
+
+  const nextContent = content.map((block) => {
+    if (
+      block &&
+      typeof block === "object" &&
+      (block as ImageContentBlock).type === "image"
+    ) {
+      const b = block as ImageContentBlock;
+      return { ...b, mimeType: sniffed };
+    }
+    if (
+      block &&
+      typeof block === "object" &&
+      (block as TextContentBlock).type === "text" &&
+      typeof (block as TextContentBlock).text === "string"
+    ) {
+      const b = block as TextContentBlock;
+      return { ...b, text: rewriteReadImageHeader(b.text, sniffed) };
+    }
+    return block;
+  });
+
+  return { ...result, content: nextContent };
+}
+
+function createClawdisReadTool(base: AgentTool): AgentTool {
+  return {
+    ...base,
+    execute: async (toolCallId, params, signal) => {
+      const result = (await base.execute(
+        toolCallId,
+        params,
+        signal,
+      )) as ToolResult;
+      const record =
+        params && typeof params === "object"
+          ? (params as Record<string, unknown>)
+          : undefined;
+      const filePath =
+        typeof record?.path === "string" ? String(record.path) : "";
+      return normalizeReadImageResult(result, filePath);
+    },
+  };
+}
+
+export function createClawdisCodingTools(): AgentTool[] {
+  return codingTools.map((tool) =>
+    tool.name === readTool.name ? createClawdisReadTool(tool) : tool,
+  );
+}