fix: skip audio files from text extraction to prevent binary processing (#7475)
* fix: skip audio files from text extraction early

  Audio files should not be processed through extractFileBlocks for text extraction; they are handled by the dedicated audio transcription capability (STT).

  Previously, audio files were only skipped if they didn't "look like text" (the looksLikeUtf8Text check). As a result, some binary audio data (e.g., long Telegram voice messages) could accidentally pass the heuristic check and get processed as text content.

  This fix:
  1. Adds audio to the early skip alongside image/video (more efficient).
  2. Removes the redundant secondary check that had the flawed condition.

  Fixes audio binary being incorrectly processed as text in Telegram and other platforms.

* Media: skip binary media in file extraction (#7475) (thanks @AlexZhangji)

---------

Co-authored-by: Shakker <shakkerdroid@gmail.com>
main
parent
966228a6a9
commit
f49297e2c1
|
|
@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
|
||||||
|
|
||||||
- fix(agents): validate AbortSignal instances before calling AbortSignal.any() (#7277) (thanks @Elarwei001)
|
- fix(agents): validate AbortSignal instances before calling AbortSignal.any() (#7277) (thanks @Elarwei001)
|
||||||
- fix(webchat): respect user scroll position during streaming and refresh (#7226) (thanks @marcomarandiz)
|
- fix(webchat): respect user scroll position during streaming and refresh (#7226) (thanks @marcomarandiz)
|
||||||
|
- Media understanding: skip binary media from file text extraction. (#7475) Thanks @AlexZhangji.
|
||||||
- Security: guard skill installer downloads with SSRF checks (block private/localhost URLs).
|
- Security: guard skill installer downloads with SSRF checks (block private/localhost URLs).
|
||||||
- Media understanding: apply SSRF guardrails to provider fetches; allow private baseUrl overrides explicitly.
|
- Media understanding: apply SSRF guardrails to provider fetches; allow private baseUrl overrides explicitly.
|
||||||
- Tests: stub SSRF DNS pinning in web auto-reply + Gemini video coverage. (#6619) Thanks @joshp123.
|
- Tests: stub SSRF DNS pinning in web auto-reply + Gemini video coverage. (#6619) Thanks @joshp123.
|
||||||
|
|
|
||||||
|
|
@ -528,18 +528,16 @@ describe("applyMediaUnderstanding", () => {
|
||||||
expect(ctx.BodyForCommands).toBe("audio ok");
|
expect(ctx.BodyForCommands).toBe("audio ok");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("treats text-like audio attachments as CSV (comma wins over tabs)", async () => {
|
it("treats text-like attachments as CSV (comma wins over tabs)", async () => {
|
||||||
const { applyMediaUnderstanding } = await loadApply();
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
||||||
const csvPath = path.join(dir, "data.mp3");
|
const csvPath = path.join(dir, "data.bin");
|
||||||
const csvText = '"a","b"\t"c"\n"1","2"\t"3"';
|
const csvText = '"a","b"\t"c"\n"1","2"\t"3"';
|
||||||
const csvBuffer = Buffer.concat([Buffer.from([0xff, 0xfe]), Buffer.from(csvText, "utf16le")]);
|
await fs.writeFile(csvPath, csvText);
|
||||||
await fs.writeFile(csvPath, csvBuffer);
|
|
||||||
|
|
||||||
const ctx: MsgContext = {
|
const ctx: MsgContext = {
|
||||||
Body: "<media:audio>",
|
Body: "<media:file>",
|
||||||
MediaPath: csvPath,
|
MediaPath: csvPath,
|
||||||
MediaType: "audio/mpeg",
|
|
||||||
};
|
};
|
||||||
const cfg: OpenClawConfig = {
|
const cfg: OpenClawConfig = {
|
||||||
tools: {
|
tools: {
|
||||||
|
|
@ -554,21 +552,20 @@ describe("applyMediaUnderstanding", () => {
|
||||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||||
|
|
||||||
expect(result.appliedFile).toBe(true);
|
expect(result.appliedFile).toBe(true);
|
||||||
expect(ctx.Body).toContain('<file name="data.mp3" mime="text/csv">');
|
expect(ctx.Body).toContain('<file name="data.bin" mime="text/csv">');
|
||||||
expect(ctx.Body).toContain('"a","b"\t"c"');
|
expect(ctx.Body).toContain('"a","b"\t"c"');
|
||||||
});
|
});
|
||||||
|
|
||||||
it("infers TSV when tabs are present without commas", async () => {
|
it("infers TSV when tabs are present without commas", async () => {
|
||||||
const { applyMediaUnderstanding } = await loadApply();
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
||||||
const tsvPath = path.join(dir, "report.mp3");
|
const tsvPath = path.join(dir, "report.bin");
|
||||||
const tsvText = "a\tb\tc\n1\t2\t3";
|
const tsvText = "a\tb\tc\n1\t2\t3";
|
||||||
await fs.writeFile(tsvPath, tsvText);
|
await fs.writeFile(tsvPath, tsvText);
|
||||||
|
|
||||||
const ctx: MsgContext = {
|
const ctx: MsgContext = {
|
||||||
Body: "<media:audio>",
|
Body: "<media:file>",
|
||||||
MediaPath: tsvPath,
|
MediaPath: tsvPath,
|
||||||
MediaType: "audio/mpeg",
|
|
||||||
};
|
};
|
||||||
const cfg: OpenClawConfig = {
|
const cfg: OpenClawConfig = {
|
||||||
tools: {
|
tools: {
|
||||||
|
|
@ -583,21 +580,20 @@ describe("applyMediaUnderstanding", () => {
|
||||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||||
|
|
||||||
expect(result.appliedFile).toBe(true);
|
expect(result.appliedFile).toBe(true);
|
||||||
expect(ctx.Body).toContain('<file name="report.mp3" mime="text/tab-separated-values">');
|
expect(ctx.Body).toContain('<file name="report.bin" mime="text/tab-separated-values">');
|
||||||
expect(ctx.Body).toContain("a\tb\tc");
|
expect(ctx.Body).toContain("a\tb\tc");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("treats cp1252-like audio attachments as text", async () => {
|
it("treats cp1252-like attachments as text", async () => {
|
||||||
const { applyMediaUnderstanding } = await loadApply();
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
||||||
const filePath = path.join(dir, "legacy.mp3");
|
const filePath = path.join(dir, "legacy.bin");
|
||||||
const cp1252Bytes = Buffer.from([0x93, 0x48, 0x69, 0x94, 0x20, 0x54, 0x65, 0x73, 0x74]);
|
const cp1252Bytes = Buffer.from([0x93, 0x48, 0x69, 0x94, 0x20, 0x54, 0x65, 0x73, 0x74]);
|
||||||
await fs.writeFile(filePath, cp1252Bytes);
|
await fs.writeFile(filePath, cp1252Bytes);
|
||||||
|
|
||||||
const ctx: MsgContext = {
|
const ctx: MsgContext = {
|
||||||
Body: "<media:audio>",
|
Body: "<media:file>",
|
||||||
MediaPath: filePath,
|
MediaPath: filePath,
|
||||||
MediaType: "audio/mpeg",
|
|
||||||
};
|
};
|
||||||
const cfg: OpenClawConfig = {
|
const cfg: OpenClawConfig = {
|
||||||
tools: {
|
tools: {
|
||||||
|
|
@ -645,17 +641,16 @@ describe("applyMediaUnderstanding", () => {
|
||||||
expect(ctx.Body).not.toContain("<file");
|
expect(ctx.Body).not.toContain("<file");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("respects configured allowedMimes for text-like audio attachments", async () => {
|
it("respects configured allowedMimes for text-like attachments", async () => {
|
||||||
const { applyMediaUnderstanding } = await loadApply();
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-"));
|
||||||
const tsvPath = path.join(dir, "report.mp3");
|
const tsvPath = path.join(dir, "report.bin");
|
||||||
const tsvText = "a\tb\tc\n1\t2\t3";
|
const tsvText = "a\tb\tc\n1\t2\t3";
|
||||||
await fs.writeFile(tsvPath, tsvText);
|
await fs.writeFile(tsvPath, tsvText);
|
||||||
|
|
||||||
const ctx: MsgContext = {
|
const ctx: MsgContext = {
|
||||||
Body: "<media:audio>",
|
Body: "<media:file>",
|
||||||
MediaPath: tsvPath,
|
MediaPath: tsvPath,
|
||||||
MediaType: "audio/mpeg",
|
|
||||||
};
|
};
|
||||||
const cfg: OpenClawConfig = {
|
const cfg: OpenClawConfig = {
|
||||||
gateway: {
|
gateway: {
|
||||||
|
|
@ -679,7 +674,7 @@ describe("applyMediaUnderstanding", () => {
|
||||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||||
|
|
||||||
expect(result.appliedFile).toBe(false);
|
expect(result.appliedFile).toBe(false);
|
||||||
expect(ctx.Body).toBe("<media:audio>");
|
expect(ctx.Body).toBe("<media:file>");
|
||||||
expect(ctx.Body).not.toContain("<file");
|
expect(ctx.Body).not.toContain("<file");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -740,10 +735,11 @@ describe("applyMediaUnderstanding", () => {
|
||||||
|
|
||||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||||
|
|
||||||
|
const body = ctx.Body ?? "";
|
||||||
expect(result.appliedFile).toBe(true);
|
expect(result.appliedFile).toBe(true);
|
||||||
expect(ctx.Body).toContain("</file>");
|
expect(body).toContain("</file>");
|
||||||
expect(ctx.Body).toContain("<file");
|
expect(body).toContain("<file");
|
||||||
expect((ctx.Body.match(/<\/file>/g) ?? []).length).toBe(1);
|
expect((body.match(/<\/file>/g) ?? []).length).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("normalizes MIME types to prevent attribute injection", async () => {
|
it("normalizes MIME types to prevent attribute injection", async () => {
|
||||||
|
|
|
||||||
|
|
@ -317,6 +317,13 @@ function resolveTextMimeFromName(name?: string): string | undefined {
|
||||||
return TEXT_EXT_MIME.get(ext);
|
return TEXT_EXT_MIME.get(ext);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a MIME type belongs to one of the binary media families
 * (`image/*`, `audio/*`, `video/*`).
 *
 * MIME type and subtype names are case-insensitive, so the value is trimmed
 * and lower-cased before the prefix comparison; `"Audio/MPEG"` and
 * `"audio/mpeg"` are treated the same way.
 *
 * @param mime - MIME type to classify; may be undefined or empty.
 * @returns `true` when the type starts with `image/`, `audio/` or `video/`;
 *   `false` otherwise, including when `mime` is missing or blank.
 */
function isBinaryMediaMime(mime?: string): boolean {
  if (!mime) {
    return false;
  }
  // Normalize defensively so callers that pass a raw header value still match.
  const normalized = mime.trim().toLowerCase();
  return (
    normalized.startsWith("image/") ||
    normalized.startsWith("audio/") ||
    normalized.startsWith("video/")
  );
}
|
||||||
|
|
||||||
async function extractFileBlocks(params: {
|
async function extractFileBlocks(params: {
|
||||||
attachments: ReturnType<typeof normalizeMediaAttachments>;
|
attachments: ReturnType<typeof normalizeMediaAttachments>;
|
||||||
cache: ReturnType<typeof createMediaAttachmentCache>;
|
cache: ReturnType<typeof createMediaAttachmentCache>;
|
||||||
|
|
@ -337,7 +344,7 @@ async function extractFileBlocks(params: {
|
||||||
}
|
}
|
||||||
const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
|
const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
|
||||||
const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
|
const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
|
||||||
if (!forcedTextMime && (kind === "image" || kind === "video")) {
|
if (!forcedTextMime && (kind === "image" || kind === "video" || kind === "audio")) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!limits.allowUrl && attachment.url && !attachment.path) {
|
if (!limits.allowUrl && attachment.url && !attachment.path) {
|
||||||
|
|
@ -361,16 +368,17 @@ async function extractFileBlocks(params: {
|
||||||
}
|
}
|
||||||
const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url;
|
const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url;
|
||||||
const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
|
const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
|
||||||
|
const rawMime = bufferResult?.mime ?? attachment.mime;
|
||||||
|
const normalizedRawMime = normalizeMimeType(rawMime);
|
||||||
|
if (!forcedTextMimeResolved && isBinaryMediaMime(normalizedRawMime)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
|
const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
|
||||||
const textSample = decodeTextSample(bufferResult?.buffer);
|
const textSample = decodeTextSample(bufferResult?.buffer);
|
||||||
const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
|
const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
|
||||||
if (!forcedTextMimeResolved && kind === "audio" && !textLike) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined;
|
const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined;
|
||||||
const textHint =
|
const textHint =
|
||||||
forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);
|
forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);
|
||||||
const rawMime = bufferResult?.mime ?? attachment.mime;
|
|
||||||
const mimeType = sanitizeMimeType(textHint ?? normalizeMimeType(rawMime));
|
const mimeType = sanitizeMimeType(textHint ?? normalizeMimeType(rawMime));
|
||||||
// Log when MIME type is overridden from non-text to text for auditability
|
// Log when MIME type is overridden from non-text to text for auditability
|
||||||
if (textHint && rawMime && !rawMime.startsWith("text/")) {
|
if (textHint && rawMime && !rawMime.startsWith("text/")) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue