feat: add configurable web_fetch maxChars cap
parent
6b4b6049b4
commit
d3ba57b7d7
|
|
@ -2039,6 +2039,7 @@ of `every`, keep `HEARTBEAT.md` tiny, and/or choose a cheaper `model`.
|
||||||
- `tools.web.search.cacheTtlMinutes` (default 15)
|
- `tools.web.search.cacheTtlMinutes` (default 15)
|
||||||
- `tools.web.fetch.enabled` (default true)
|
- `tools.web.fetch.enabled` (default true)
|
||||||
- `tools.web.fetch.maxChars` (default 50000)
|
- `tools.web.fetch.maxChars` (default 50000)
|
||||||
|
- `tools.web.fetch.maxCharsCap` (default 50000; clamps maxChars from config/tool calls)
|
||||||
- `tools.web.fetch.timeoutSeconds` (default 30)
|
- `tools.web.fetch.timeoutSeconds` (default 30)
|
||||||
- `tools.web.fetch.cacheTtlMinutes` (default 15)
|
- `tools.web.fetch.cacheTtlMinutes` (default 15)
|
||||||
- `tools.web.fetch.userAgent` (optional override)
|
- `tools.web.fetch.userAgent` (optional override)
|
||||||
|
|
|
||||||
|
|
@ -252,6 +252,7 @@ Core parameters:
|
||||||
Notes:
|
Notes:
|
||||||
|
|
||||||
- Enable via `tools.web.fetch.enabled`.
|
- Enable via `tools.web.fetch.enabled`.
|
||||||
|
- `maxChars` is clamped by `tools.web.fetch.maxCharsCap` (default 50000).
|
||||||
- Responses are cached (default 15 min).
|
- Responses are cached (default 15 min).
|
||||||
- For JS-heavy sites, prefer the browser tool.
|
- For JS-heavy sites, prefer the browser tool.
|
||||||
- See [Web tools](/tools/web) for setup.
|
- See [Web tools](/tools/web) for setup.
|
||||||
|
|
|
||||||
|
|
@ -221,6 +221,7 @@ Fetch a URL and extract readable content.
|
||||||
fetch: {
|
fetch: {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
maxChars: 50000,
|
maxChars: 50000,
|
||||||
|
maxCharsCap: 50000,
|
||||||
timeoutSeconds: 30,
|
timeoutSeconds: 30,
|
||||||
cacheTtlMinutes: 15,
|
cacheTtlMinutes: 15,
|
||||||
maxRedirects: 3,
|
maxRedirects: 3,
|
||||||
|
|
@ -252,6 +253,7 @@ Notes:
|
||||||
- Firecrawl requests use bot-circumvention mode and cache results by default.
|
- Firecrawl requests use bot-circumvention mode and cache results by default.
|
||||||
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
|
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
|
||||||
- `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`).
|
- `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`).
|
||||||
|
- `maxChars` is clamped to `tools.web.fetch.maxCharsCap` (default 50000).
|
||||||
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
|
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
|
||||||
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
|
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
|
||||||
- Responses are cached (default 15 minutes) to reduce repeated fetches.
|
- Responses are cached (default 15 minutes) to reduce repeated fetches.
|
||||||
|
|
|
||||||
|
|
@ -95,6 +95,17 @@ function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolve the hard upper bound applied to `maxChars` for web_fetch.
 *
 * Falls back to DEFAULT_FETCH_MAX_CHARS when no fetch config is given, when
 * `maxCharsCap` is absent, or when it is not a finite number. A usable value
 * is floored to an integer and raised to at least 100.
 */
function resolveFetchMaxCharsCap(fetch?: WebFetchConfig): number {
  // No config, or config without the key: nothing to read.
  if (!fetch || !("maxCharsCap" in fetch)) {
    return DEFAULT_FETCH_MAX_CHARS;
  }
  const configured = fetch.maxCharsCap;
  // Rejects undefined, non-numbers, NaN and +/-Infinity in one place.
  if (typeof configured !== "number" || !Number.isFinite(configured)) {
    return DEFAULT_FETCH_MAX_CHARS;
  }
  const whole = Math.floor(configured);
  return Math.max(100, whole);
}
|
||||||
|
|
||||||
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
|
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
|
||||||
if (!fetch || typeof fetch !== "object") {
|
if (!fetch || typeof fetch !== "object") {
|
||||||
return undefined;
|
return undefined;
|
||||||
|
|
@ -160,9 +171,10 @@ function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): nu
|
||||||
return DEFAULT_FIRECRAWL_MAX_AGE_MS;
|
return DEFAULT_FIRECRAWL_MAX_AGE_MS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Resolve the effective character limit for a fetch result.
 *
 * @param value - Requested limit (tool-call argument or config value). Anything
 *   that is not a finite number is ignored in favor of `fallback`.
 * @param fallback - Limit used when `value` is unusable.
 * @param cap - Upper bound applied last. Optional so two-argument callers keep
 *   working; when omitted (or NaN) no upper bound is applied.
 * @returns The limit floored to an integer, raised to at least 100, then
 *   lowered to `cap`. Because the cap is applied last, a cap below 100 wins
 *   over the 100 floor.
 */
function resolveMaxChars(
  value: unknown,
  fallback: number,
  cap: number = Number.POSITIVE_INFINITY,
): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  const clamped = Math.max(100, Math.floor(parsed));
  // Math.min propagates NaN, which would turn the limit itself into NaN;
  // an unusable cap is skipped instead.
  return Number.isNaN(cap) ? clamped : Math.min(clamped, cap);
}
|
||||||
|
|
||||||
function resolveMaxRedirects(value: unknown, fallback: number): number {
|
function resolveMaxRedirects(value: unknown, fallback: number): number {
|
||||||
|
|
@ -647,10 +659,15 @@ export function createWebFetchTool(options?: {
|
||||||
const url = readStringParam(params, "url", { required: true });
|
const url = readStringParam(params, "url", { required: true });
|
||||||
const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown";
|
const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown";
|
||||||
const maxChars = readNumberParam(params, "maxChars", { integer: true });
|
const maxChars = readNumberParam(params, "maxChars", { integer: true });
|
||||||
|
const maxCharsCap = resolveFetchMaxCharsCap(fetch);
|
||||||
const result = await runWebFetch({
|
const result = await runWebFetch({
|
||||||
url,
|
url,
|
||||||
extractMode,
|
extractMode,
|
||||||
maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS),
|
maxChars: resolveMaxChars(
|
||||||
|
maxChars ?? fetch?.maxChars,
|
||||||
|
DEFAULT_FETCH_MAX_CHARS,
|
||||||
|
maxCharsCap,
|
||||||
|
),
|
||||||
maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS),
|
maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS),
|
||||||
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
|
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
|
||||||
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
|
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,20 @@ function firecrawlError(): MockResponse {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build a successful (200) mock response whose body is plain text.
 *
 * @param text - Body returned by the mock's `text()` method.
 * @param url - URL reported on the response.
 * @param contentType - Value of the `content-type` header.
 */
function textResponse(
  text: string,
  url = "https://example.com/",
  contentType = "text/plain; charset=utf-8",
): MockResponse {
  const headers = makeHeaders({ "content-type": contentType });
  const response: MockResponse = {
    ok: true,
    status: 200,
    url,
    headers,
    text: () => Promise.resolve(text),
  };
  return response;
}
|
||||||
|
|
||||||
function errorHtmlResponse(
|
function errorHtmlResponse(
|
||||||
html: string,
|
html: string,
|
||||||
status = 404,
|
status = 404,
|
||||||
|
|
@ -322,6 +336,37 @@ describe("web_fetch extraction fallbacks", () => {
|
||||||
expect(details.extractor).toBe("firecrawl");
|
expect(details.extractor).toBe("firecrawl");
|
||||||
expect(details.text).toContain("firecrawl fallback");
|
expect(details.text).toContain("firecrawl fallback");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("wraps external content and clamps oversized maxChars", async () => {
  // Body is far larger than the configured cap, so truncation must occur.
  const body = "a".repeat(80_000);
  const fetchMock = vi.fn(
    (input: RequestInfo) =>
      Promise.resolve(textResponse(body, requestUrl(input))) as Promise<Response>,
  );
  // @ts-expect-error mock fetch
  global.fetch = fetchMock;

  // Cache and Firecrawl are disabled in this config; the cap is set to 10k chars.
  const webFetch = createWebFetchTool({
    config: {
      tools: {
        web: {
          fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false }, maxCharsCap: 10_000 },
        },
      },
    },
    sandboxed: false,
  });

  // The tool call asks for far more than the cap allows.
  const outcome = await webFetch?.execute?.("call", {
    url: "https://example.com/large",
    maxChars: 200_000,
  });
  const details = outcome?.details as { text?: string; length?: number; truncated?: boolean };

  expect(details.text).toContain("<<<EXTERNAL_UNTRUSTED_CONTENT>>>");
  expect(details.text).toContain("Source: Web Fetch");
  expect(details.length).toBeLessThanOrEqual(10_000);
  expect(details.truncated).toBe(true);
});
|
||||||
it("strips and truncates HTML from error responses", async () => {
|
it("strips and truncates HTML from error responses", async () => {
|
||||||
const long = "x".repeat(12_000);
|
const long = "x".repeat(12_000);
|
||||||
const html =
|
const html =
|
||||||
|
|
|
||||||
|
|
@ -483,6 +483,8 @@ const FIELD_HELP: Record<string, string> = {
|
||||||
'Perplexity model override (default: "perplexity/sonar-pro").',
|
'Perplexity model override (default: "perplexity/sonar-pro").',
|
||||||
"tools.web.fetch.enabled": "Enable the web_fetch tool (lightweight HTTP fetch).",
|
"tools.web.fetch.enabled": "Enable the web_fetch tool (lightweight HTTP fetch).",
|
||||||
"tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).",
|
"tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).",
|
||||||
|
"tools.web.fetch.maxCharsCap":
|
||||||
|
"Hard cap for web_fetch maxChars (applies to config and tool calls).",
|
||||||
"tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
|
"tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.",
|
||||||
"tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
|
"tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.",
|
||||||
"tools.web.fetch.maxRedirects": "Maximum redirects allowed for web_fetch (default: 3).",
|
"tools.web.fetch.maxRedirects": "Maximum redirects allowed for web_fetch (default: 3).",
|
||||||
|
|
|
||||||
|
|
@ -361,6 +361,8 @@ export type ToolsConfig = {
|
||||||
enabled?: boolean;
|
enabled?: boolean;
|
||||||
/** Max characters to return from fetched content. */
|
/** Max characters to return from fetched content. */
|
||||||
maxChars?: number;
|
maxChars?: number;
|
||||||
|
/** Hard cap for maxChars (tool or config), defaults to 50000. */
|
||||||
|
maxCharsCap?: number;
|
||||||
/** Timeout in seconds for fetch requests. */
|
/** Timeout in seconds for fetch requests. */
|
||||||
timeoutSeconds?: number;
|
timeoutSeconds?: number;
|
||||||
/** Cache TTL in minutes for fetched content. */
|
/** Cache TTL in minutes for fetched content. */
|
||||||
|
|
|
||||||
|
|
@ -191,6 +191,7 @@ export const ToolsWebFetchSchema = z
|
||||||
.object({
|
.object({
|
||||||
enabled: z.boolean().optional(),
|
enabled: z.boolean().optional(),
|
||||||
maxChars: z.number().int().positive().optional(),
|
maxChars: z.number().int().positive().optional(),
|
||||||
|
maxCharsCap: z.number().int().positive().optional(),
|
||||||
timeoutSeconds: z.number().int().positive().optional(),
|
timeoutSeconds: z.number().int().positive().optional(),
|
||||||
cacheTtlMinutes: z.number().nonnegative().optional(),
|
cacheTtlMinutes: z.number().nonnegative().optional(),
|
||||||
maxRedirects: z.number().int().nonnegative().optional(),
|
maxRedirects: z.number().int().nonnegative().optional(),
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue