feat: support audio/video/doc media caps and transcript context

main
Peter Steinberger 2025-11-25 23:21:35 +01:00
parent 5dced02a20
commit e0425ad3e1
5 changed files with 147 additions and 30 deletions

View File

@ -4,7 +4,7 @@ import { z } from "zod";
// Preferred binary name for Claude CLI invocations. // Preferred binary name for Claude CLI invocations.
export const CLAUDE_BIN = "claude"; export const CLAUDE_BIN = "claude";
export const CLAUDE_IDENTITY_PREFIX = export const CLAUDE_IDENTITY_PREFIX =
"You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters."; "You are Clawd (Claude) running on the user's Mac via warelay. Your scratchpad is /Users/steipete/clawd; this is your folder and you can add what you like in markdown files and/or images. You don't need to be concise, but WhatsApp replies must stay under ~1500 characters. Media you can send: images ≤6MB, audio/video ≤16MB, documents ≤100MB. The prompt may include a media path and an optional Transcript: section—use them when present.";
function extractClaudeText(payload: unknown): string | undefined { function extractClaudeText(payload: unknown): string | undefined {
// Best-effort walker to find the primary text field in Claude JSON outputs. // Best-effort walker to find the primary text field in Claude JSON outputs.

View File

@ -163,8 +163,8 @@ describe("config and templating", () => {
expect(prompt).toContain("/tmp/voice.ogg"); expect(prompt).toContain("/tmp/voice.ogg");
expect(prompt).toContain("Transcript:"); expect(prompt).toContain("Transcript:");
expect(prompt).toContain("voice transcript"); expect(prompt).toContain("voice transcript");
expect(result?.text).toBeUndefined(); expect(result?.text).toBe("ok");
}); });
it("getReplyFromConfig skips transcription when not configured", async () => { it("getReplyFromConfig skips transcription when not configured", async () => {
const cfg = { const cfg = {

31
src/media/constants.ts Normal file
View File

@ -0,0 +1,31 @@
/** Size cap for image media, in bytes (6 MiB). */
export const MAX_IMAGE_BYTES = 6 * 1024 ** 2;
/** Size cap for audio media, in bytes (16 MiB). */
export const MAX_AUDIO_BYTES = 16 * 1024 ** 2;
/** Size cap for video media, in bytes (16 MiB). */
export const MAX_VIDEO_BYTES = 16 * 1024 ** 2;
/** Size cap for document media, in bytes (100 MiB). */
export const MAX_DOCUMENT_BYTES = 100 * 1024 ** 2;
/** Coarse media category derived from a MIME type. */
export type MediaKind = "image" | "audio" | "video" | "document" | "unknown";

/**
 * Classify a MIME type (or a raw Content-Type header value) into a {@link MediaKind}.
 *
 * Only the top-level type prefix is inspected, so trailing parameters such as
 * `; charset=utf-8` do not affect the result. Matching ignores letter case and
 * surrounding whitespace.
 *
 * @param mime - MIME type or Content-Type value; may be `undefined`, `null`, or empty.
 * @returns `"image"`, `"audio"`, or `"video"` for the matching top-level types,
 *   `"document"` for any `application/*` type, and `"unknown"` for everything
 *   else (including a missing or empty value).
 */
export function mediaKindFromMime(mime?: string | null): MediaKind {
  if (!mime) return "unknown";
  // MIME type/subtype names are case-insensitive, so normalize before prefix
  // matching; otherwise a header like "IMAGE/JPEG" would fall through to "unknown".
  const normalized = mime.trim().toLowerCase();
  if (normalized.startsWith("image/")) return "image";
  if (normalized.startsWith("audio/")) return "audio";
  if (normalized.startsWith("video/")) return "video";
  // Every application/* type (PDF included) is treated as a document.
  if (normalized.startsWith("application/")) return "document";
  return "unknown";
}
/**
 * Look up the byte-size cap for a media kind.
 *
 * @param kind - Media category to look up.
 * @returns The `MAX_*_BYTES` constant for `"image"`, `"audio"`, or `"video"`;
 *   every other kind (including `"document"` and `"unknown"`) gets the
 *   document cap.
 */
export function maxBytesForKind(kind: MediaKind): number {
  if (kind === "image") return MAX_IMAGE_BYTES;
  if (kind === "audio") return MAX_AUDIO_BYTES;
  if (kind === "video") return MAX_VIDEO_BYTES;
  // "document", "unknown", and any unexpected value share the document cap.
  return MAX_DOCUMENT_BYTES;
}

View File

@ -719,8 +719,16 @@ describe("provider-web", () => {
sendMedia, sendMedia,
}); });
expect(sendMedia).not.toHaveBeenCalled(); expect(sendMedia).toHaveBeenCalledTimes(1);
expect(reply).toHaveBeenCalledWith("hi"); const payload = sendMedia.mock.calls[0][0] as {
document?: Buffer;
caption?: string;
fileName?: string;
};
expect(payload.document).toBeInstanceOf(Buffer);
expect(payload.fileName).toBe("file.pdf");
expect(payload.caption).toBe("hi");
expect(reply).not.toHaveBeenCalled();
fetchMock.mockRestore(); fetchMock.mockRestore();
}); });

View File

@ -21,6 +21,7 @@ import { loadConfig } from "./config/config.js";
import { danger, info, isVerbose, logVerbose, success } from "./globals.js"; import { danger, info, isVerbose, logVerbose, success } from "./globals.js";
import { logInfo } from "./logger.js"; import { logInfo } from "./logger.js";
import { getChildLogger } from "./logging.js"; import { getChildLogger } from "./logging.js";
import { maxBytesForKind, mediaKindFromMime } from "./media/constants.js";
import { saveMediaBuffer } from "./media/store.js"; import { saveMediaBuffer } from "./media/store.js";
import { defaultRuntime, type RuntimeEnv } from "./runtime.js"; import { defaultRuntime, type RuntimeEnv } from "./runtime.js";
import type { Provider } from "./utils.js"; import type { Provider } from "./utils.js";
@ -485,12 +486,39 @@ export async function monitorWebProvider(
logVerbose( logVerbose(
`Web auto-reply media size: ${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB`, `Web auto-reply media size: ${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB`,
); );
logVerbose(
`Web auto-reply media source: ${replyResult.mediaUrl} (kind ${media.kind})`,
);
} }
if (media.kind === "image") {
await msg.sendMedia({ await msg.sendMedia({
image: media.buffer, image: media.buffer,
caption: replyResult.text || undefined, caption: replyResult.text || undefined,
mimetype: media.contentType, mimetype: media.contentType,
}); });
} else if (media.kind === "audio") {
await msg.sendMedia({
audio: media.buffer,
ptt: true,
mimetype: media.contentType,
caption: replyResult.text || undefined,
} as AnyMessageContent);
} else if (media.kind === "video") {
await msg.sendMedia({
video: media.buffer,
caption: replyResult.text || undefined,
mimetype: media.contentType,
});
} else {
const fileName =
replyResult.mediaUrl.split("/").pop() ?? "file";
await msg.sendMedia({
document: media.buffer,
fileName,
caption: replyResult.text || undefined,
mimetype: media.contentType,
} as AnyMessageContent);
}
logInfo( logInfo(
`✅ Sent web media reply to ${msg.from} (${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB)`, `✅ Sent web media reply to ${msg.from} (${(media.buffer.length / (1024 * 1024)).toFixed(2)}MB)`,
runtime, runtime,
@ -502,6 +530,7 @@ export async function monitorWebProvider(
text: replyResult.text ?? null, text: replyResult.text ?? null,
mediaUrl: replyResult.mediaUrl, mediaUrl: replyResult.mediaUrl,
mediaSizeBytes: media.buffer.length, mediaSizeBytes: media.buffer.length,
mediaKind: media.kind,
durationMs: Date.now() - replyStarted, durationMs: Date.now() - replyStarted,
}, },
"auto-reply sent (media)", "auto-reply sent (media)",
@ -727,22 +756,21 @@ async function downloadInboundMedia(
async function loadWebMedia( async function loadWebMedia(
mediaUrl: string, mediaUrl: string,
maxBytes: number = DEFAULT_WEB_MEDIA_BYTES, maxBytes?: number,
): Promise<{ buffer: Buffer; contentType?: string }> { ): Promise<{ buffer: Buffer; contentType?: string; kind: MediaKind }> {
// Hard cap to avoid Anthropic/WhatsApp 5MB image limit that triggers API 400s.
if (mediaUrl.startsWith("file://")) { if (mediaUrl.startsWith("file://")) {
mediaUrl = mediaUrl.replace("file://", ""); mediaUrl = mediaUrl.replace("file://", "");
} }
const optimizeAndClamp = async (buffer: Buffer) => { const optimizeAndClampImage = async (buffer: Buffer, cap: number) => {
const originalSize = buffer.length; const originalSize = buffer.length;
const optimized = await optimizeImageToJpeg(buffer, maxBytes); const optimized = await optimizeImageToJpeg(buffer, cap);
if (optimized.optimizedSize < originalSize && isVerbose()) { if (optimized.optimizedSize < originalSize && isVerbose()) {
logVerbose( logVerbose(
`Optimized media from ${(originalSize / (1024 * 1024)).toFixed(2)}MB to ${(optimized.optimizedSize / (1024 * 1024)).toFixed(2)}MB (side≤${optimized.resizeSide}px, q=${optimized.quality})`, `Optimized media from ${(originalSize / (1024 * 1024)).toFixed(2)}MB to ${(optimized.optimizedSize / (1024 * 1024)).toFixed(2)}MB (side≤${optimized.resizeSide}px, q=${optimized.quality})`,
); );
} }
if (optimized.buffer.length > maxBytes) { if (optimized.buffer.length > cap) {
throw new Error( throw new Error(
`Media could not be reduced below ${(maxBytes / (1024 * 1024)).toFixed(0)}MB (got ${( `Media could not be reduced below ${(maxBytes / (1024 * 1024)).toFixed(0)}MB (got ${(
optimized.buffer.length / (1024 * 1024) optimized.buffer.length / (1024 * 1024)
@ -752,6 +780,7 @@ async function loadWebMedia(
return { return {
buffer: optimized.buffer, buffer: optimized.buffer,
contentType: "image/jpeg", contentType: "image/jpeg",
kind: "image" as const,
}; };
}; };
@ -761,11 +790,60 @@ async function loadWebMedia(
throw new Error(`Failed to fetch media: HTTP ${res.status}`); throw new Error(`Failed to fetch media: HTTP ${res.status}`);
} }
const array = Buffer.from(await res.arrayBuffer()); const array = Buffer.from(await res.arrayBuffer());
return optimizeAndClamp(array); const contentType = res.headers.get("content-type");
const kind = mediaKindFromMime(contentType);
const cap = Math.min(
maxBytes ?? maxBytesForKind(kind),
maxBytesForKind(kind),
);
if (kind === "image") {
return optimizeAndClampImage(array, cap);
}
if (array.length > cap) {
throw new Error(
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
array.length / (1024 * 1024)
).toFixed(2)}MB)`,
);
}
return { buffer: array, contentType: contentType ?? undefined, kind };
} }
// Local path // Local path
const data = await fs.readFile(mediaUrl); const data = await fs.readFile(mediaUrl);
return optimizeAndClamp(data); const ext = path.extname(mediaUrl);
const mime =
(ext &&
(
{
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".png": "image/png",
".webp": "image/webp",
".gif": "image/gif",
".ogg": "audio/ogg",
".opus": "audio/ogg",
".mp3": "audio/mpeg",
".mp4": "video/mp4",
".pdf": "application/pdf",
} as Record<string, string | undefined>
)[ext.toLowerCase()]) ??
undefined;
const kind = mediaKindFromMime(mime);
const cap = Math.min(
maxBytes ?? maxBytesForKind(kind),
maxBytesForKind(kind),
);
if (kind === "image") {
return optimizeAndClampImage(data, cap);
}
if (data.length > cap) {
throw new Error(
`Media exceeds ${(cap / (1024 * 1024)).toFixed(0)}MB limit (got ${(
data.length / (1024 * 1024)
).toFixed(2)}MB)`,
);
}
return { buffer: data, contentType: mime, kind };
} }
function getStatusCode(err: unknown) { function getStatusCode(err: unknown) {