feat!(mac): move screenshot to ui

main
Peter Steinberger 2025-12-13 11:51:51 +00:00
parent cf90bd9c86
commit 36f21c5a4f
7 changed files with 129 additions and 75 deletions

View File

@ -58,20 +58,53 @@ enum ControlRequestHandler {
let result = await AgentRPC.shared.status()
return Response(ok: result.ok, message: result.error)
case let .screenshot(displayID, windowID, _):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
if let data = await Screenshotter.capture(displayID: displayID, windowID: windowID) {
return Response(ok: true, payload: data)
}
return Response(ok: false, message: "screenshot failed")
case .uiListScreens:
let screens = await MainActor.run { UIScreenService.listScreens() }
let payload = try JSONEncoder().encode(screens)
return Response(ok: true, payload: payload)
case let .uiScreenshot(screenIndex, windowID):
let authorized = await PermissionManager
.ensure([.screenRecording], interactive: false)[.screenRecording] ?? false
guard authorized else { return Response(ok: false, message: "screen recording permission missing") }
let resolution: (screenIndex: Int?, displayID: UInt32?) = await Task { @MainActor in
if let screenIndex,
let match = UIScreenService.listScreens().first(where: { $0.index == screenIndex })
{
return (screenIndex, match.displayID)
}
return (nil, nil)
}.value
let data = await Task { @MainActor in
await Screenshotter.capture(displayID: resolution.displayID, windowID: windowID)
}.value
guard let data else {
return Response(ok: false, message: "screenshot failed")
}
let dir = FileManager.default.temporaryDirectory.appendingPathComponent("clawdis-ui", isDirectory: true)
try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
let outURL = dir.appendingPathComponent("screenshot-\(Int(Date().timeIntervalSince1970 * 1000)).png")
do {
try data.write(to: outURL)
} catch {
return Response(ok: false, message: "failed to write screenshot: \(error.localizedDescription)")
}
let size = ScreenshotSize.readPNGSize(data: data)
let result = UIScreenshotResult(
path: outURL.path,
width: size?.width ?? 0,
height: size?.height ?? 0,
screenIndex: resolution.screenIndex,
displayID: resolution.displayID,
windowID: windowID)
let payload = try JSONEncoder().encode(result)
return Response(ok: true, payload: payload)
case let .runShell(command, cwd, env, timeoutSec, needsSR):
if needsSR {
let authorized = await PermissionManager

View File

@ -0,0 +1,18 @@
import Foundation
import ImageIO
/// Namespace for reading the pixel dimensions of encoded screenshot data.
enum ScreenshotSize {
    /// Pixel dimensions of an image.
    struct Size {
        let width: Int
        let height: Int
    }

    /// Looks up the pixel width and height of the first image in `data`
    /// through the ImageIO property dictionary.
    ///
    /// - Parameter data: Encoded image bytes (PNG, going by the function name).
    /// - Returns: The dimensions, or `nil` when no image source can be created,
    ///   the properties for index 0 are unavailable, or either dimension is
    ///   missing or not representable as `Int`.
    static func readPNGSize(data: Data) -> Size? {
        guard
            let imageSource = CGImageSourceCreateWithData(data as CFData, nil),
            let properties = CGImageSourceCopyPropertiesAtIndex(imageSource, 0, nil) as? [CFString: Any],
            let pixelWidth = properties[kCGImagePropertyPixelWidth] as? Int,
            let pixelHeight = properties[kCGImagePropertyPixelHeight] as? Int
        else {
            return nil
        }
        return Size(width: pixelWidth, height: pixelHeight)
    }
}

View File

@ -37,8 +37,8 @@ struct ClawdisCLI {
var kind: Kind
/// Selects how the CLI renders the response for a parsed request.
enum Kind {
// Response payload is written to a file (optionally at `outPath`) and the path is printed.
case screenshot(outPath: String?)
// Response payload is decoded as `[UIScreenInfo]`.
case uiScreens
// Response payload is decoded as `UIScreenshotResult`; its `path` is printed.
case uiScreenshot
// Response payload is printed as-is when it is non-empty UTF-8 text.
case generic
}
}
@ -95,23 +95,6 @@ struct ClawdisCLI {
if caps.isEmpty { caps = Capability.allCases }
return ParsedCLIRequest(request: .ensurePermissions(caps, interactive: interactive), kind: .generic)
case "screenshot":
var displayID: UInt32?
var windowID: UInt32?
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--display-id": if let val = args.popFirst(), let num = UInt32(val) { displayID = num }
case "--window-id": if let val = args.popFirst(), let num = UInt32(val) { windowID = num }
case "--out": outPath = args.popFirst()
default: break
}
}
return ParsedCLIRequest(
request: .screenshot(displayID: displayID, windowID: windowID, format: "png"),
kind: .screenshot(outPath: outPath))
case "ui":
guard let sub = args.first else { throw CLIError.help }
args = Array(args.dropFirst())
@ -119,6 +102,18 @@ struct ClawdisCLI {
switch sub {
case "screens":
return ParsedCLIRequest(request: .uiListScreens, kind: .uiScreens)
case "screenshot":
var screenIndex: Int?
var windowID: UInt32?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--screen-index": screenIndex = args.popFirst().flatMap(Int.init)
case "--window-id": windowID = args.popFirst().flatMap(UInt32.init)
default: break
}
}
return ParsedCLIRequest(request: .uiScreenshot(screenIndex: screenIndex, windowID: windowID), kind: .uiScreenshot)
default:
throw CLIError.help
}
@ -333,10 +328,6 @@ struct ClawdisCLI {
}
switch parsed.kind {
case let .screenshot(outPath):
let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath)
FileHandle.standardOutput.write(Data((path + "\n").utf8))
case .uiScreens:
let screens = try self.decodePayload([UIScreenInfo].self, payload: response.payload)
if screens.isEmpty {
@ -351,6 +342,10 @@ struct ClawdisCLI {
FileHandle.standardOutput.write(Data(line.utf8))
}
case .uiScreenshot:
let result = try self.decodePayload(UIScreenshotResult.self, payload: response.payload)
FileHandle.standardOutput.write(Data((result.path + "\n").utf8))
case .generic:
if let payload = response.payload, let text = String(data: payload, encoding: .utf8), !text.isEmpty {
FileHandle.standardOutput.write(payload)
@ -370,14 +365,6 @@ struct ClawdisCLI {
]
switch parsed.kind {
case let .screenshot(outPath):
if response.ok {
let path = try self.writeScreenshotPayloadToFile(payload: response.payload, outPath: outPath)
output["result"] = ["path": path]
} else {
output["result"] = NSNull()
}
case .uiScreens:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
@ -386,6 +373,14 @@ struct ClawdisCLI {
output["result"] = []
}
case .uiScreenshot:
if let payload = response.payload,
let obj = try? JSONSerialization.jsonObject(with: payload) {
output["result"] = obj
} else {
output["result"] = NSNull()
}
case .generic:
if let payload = response.payload, !payload.isEmpty {
if let obj = try? JSONSerialization.jsonObject(with: payload) {
@ -406,21 +401,6 @@ struct ClawdisCLI {
return try JSONDecoder().decode(T.self, from: payload)
}
/// Writes a screenshot payload to disk and returns the path of the written file.
///
/// - Parameters:
///   - payload: Image bytes to persist; `nil` or empty data is rejected.
///   - outPath: Destination path. When `nil` or empty, a timestamped
///     `screenshot-<millis>.png` inside a `clawdis-mac` folder under the
///     temporary directory is used.
/// - Returns: The file-system path the payload was written to.
/// - Throws: `POSIXError(.EINVAL)` for a missing or empty payload, or any error
///   raised while writing the file.
private static func writeScreenshotPayloadToFile(payload: Data?, outPath: String?) throws -> String {
    guard let bytes = payload, !bytes.isEmpty else { throw POSIXError(.EINVAL) }

    let destination: URL
    if let explicitPath = outPath, !explicitPath.isEmpty {
        destination = URL(fileURLWithPath: explicitPath).resolvingSymlinksInPath()
    } else {
        let fileManager = FileManager.default
        let folder = fileManager.temporaryDirectory.appendingPathComponent("clawdis-mac", isDirectory: true)
        // Best effort: a failure to create the folder surfaces through the write below.
        try? fileManager.createDirectory(at: folder, withIntermediateDirectories: true)
        let millis = Int(Date().timeIntervalSince1970 * 1000)
        destination = folder.appendingPathComponent("screenshot-\(millis).png")
    }

    try bytes.write(to: destination)
    return destination.path
}
private static func printHelp() {
let usage = """
clawdis-mac talk to the running Clawdis.app XPC service
@ -431,8 +411,8 @@ struct ClawdisCLI {
clawdis-mac ensure-permissions
[--cap <notifications|accessibility|screenRecording|microphone|speechRecognition>]
[--interactive]
clawdis-mac screenshot [--display-id <u32>] [--window-id <u32>] [--out <path>]
clawdis-mac ui screens
clawdis-mac ui screenshot [--screen-index <n>] [--window-id <u32>]
clawdis-mac run [--cwd <path>] [--env KEY=VAL] [--timeout <sec>] [--needs-screen-recording] <command ...>
clawdis-mac status
clawdis-mac rpc-status

View File

@ -83,6 +83,31 @@ public struct UIScreenInfo: Codable, Sendable {
}
}
/// Result of a UI screenshot request: where the image was written, its pixel
/// size, and the capture target that was requested.
public struct UIScreenshotResult: Codable, Sendable {
    /// File-system path of the written screenshot.
    public let path: String
    /// Image width in pixels.
    public let width: Int
    /// Image height in pixels.
    public let height: Int
    /// Index of the captured screen, if one was resolved.
    public let screenIndex: Int?
    /// Display identifier of the captured screen, if one was resolved.
    public let displayID: UInt32?
    /// Window identifier passed with the request, if any.
    public let windowID: UInt32?

    /// Creates a result; the capture-target fields default to `nil`.
    public init(
        path: String,
        width: Int,
        height: Int,
        screenIndex: Int? = nil,
        displayID: UInt32? = nil,
        windowID: UInt32? = nil
    ) {
        self.path = path
        self.width = width
        self.height = height
        self.screenIndex = screenIndex
        self.displayID = displayID
        self.windowID = windowID
    }
}
public enum Request: Sendable {
case notify(
title: String,
@ -91,8 +116,8 @@ public enum Request: Sendable {
priority: NotificationPriority?,
delivery: NotificationDelivery?)
case ensurePermissions([Capability], interactive: Bool)
case screenshot(displayID: UInt32?, windowID: UInt32?, format: String)
case uiListScreens
case uiScreenshot(screenIndex: Int?, windowID: UInt32?)
case runShell(
command: [String],
cwd: String?,
@ -133,7 +158,7 @@ extension Request: Codable {
case type
case title, body, sound, priority, delivery
case caps, interactive
case displayID, windowID, format
case screenIndex, windowID
case command, cwd, env, timeoutSec, needsScreenRecording
case message, thinking, session, deliver, to
case rpcStatus
@ -149,8 +174,8 @@ extension Request: Codable {
private enum Kind: String, Codable {
case notify
case ensurePermissions
case screenshot
case uiListScreens
case uiScreenshot
case runShell
case status
case agent
@ -180,15 +205,14 @@ extension Request: Codable {
try container.encode(caps, forKey: .caps)
try container.encode(interactive, forKey: .interactive)
case let .screenshot(displayID, windowID, format):
try container.encode(Kind.screenshot, forKey: .type)
try container.encodeIfPresent(displayID, forKey: .displayID)
try container.encodeIfPresent(windowID, forKey: .windowID)
try container.encode(format, forKey: .format)
case .uiListScreens:
try container.encode(Kind.uiListScreens, forKey: .type)
case let .uiScreenshot(screenIndex, windowID):
try container.encode(Kind.uiScreenshot, forKey: .type)
try container.encodeIfPresent(screenIndex, forKey: .screenIndex)
try container.encodeIfPresent(windowID, forKey: .windowID)
case let .runShell(command, cwd, env, timeoutSec, needsSR):
try container.encode(Kind.runShell, forKey: .type)
try container.encode(command, forKey: .command)
@ -265,15 +289,14 @@ extension Request: Codable {
let interactive = try container.decode(Bool.self, forKey: .interactive)
self = .ensurePermissions(caps, interactive: interactive)
case .screenshot:
let displayID = try container.decodeIfPresent(UInt32.self, forKey: .displayID)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
let format = try container.decode(String.self, forKey: .format)
self = .screenshot(displayID: displayID, windowID: windowID, format: format)
case .uiListScreens:
self = .uiListScreens
case .uiScreenshot:
let screenIndex = try container.decodeIfPresent(Int.self, forKey: .screenIndex)
let windowID = try container.decodeIfPresent(UInt32.self, forKey: .windowID)
self = .uiScreenshot(screenIndex: screenIndex, windowID: windowID)
case .runShell:
let command = try container.decode([String].self, forKey: .command)
let cwd = try container.decodeIfPresent(String.self, forKey: .cwd)

View File

@ -36,7 +36,7 @@ enum Capability { notifications, accessibility, screenRecording, appleScript, mi
enum Request {
notify(title, body, sound?)
ensurePermissions([Capability], interactive: Bool)
screenshot(displayID?, windowID?, format="png")
uiScreenshot(screenIndex?, windowID?)
runShell(command:[String], cwd?, env?, timeoutSec?, needsScreenRecording: Bool)
status
}
@ -66,8 +66,8 @@ struct Response { ok: Bool; message?: String; payload?: Data }
- Subcommands (text by default; `--json` for machine output; non-zero exit on failure):
- `notify --title --body [--sound] [--priority passive|active|timeSensitive] [--delivery system|overlay|auto]`
- `ensure-permissions --cap accessibility --cap screenRecording [--interactive]`
- `screenshot [--display-id N | --window-id N] [--out path]`
- `ui screens`
- `ui screenshot [--screen-index N] [--window-id N]`
- `run -- cmd args... [--cwd] [--env KEY=VAL] [--timeout 30] [--needs-screen-recording]`
- `status`
- Sounds: supply any macOS alert name with `--sound` per notification; omit the flag to use the system default. There is no longer a persisted “default sound” in the app UI.

View File

@ -23,7 +23,7 @@ Run the Node-based Clawdis/clawdis gateway as a direct child of the LSUIElement
## TCC guardrails (must keep)
- Screen Recording, Accessibility, mic, and speech prompts must originate from the Swift app/XPC. The Node child should never call these APIs directly; use the existing XPC/CLI broker (`clawdis-mac`) for:
- `ensure-permissions`
- `screenshot` / ScreenCaptureKit work
- `ui screenshot` / ScreenCaptureKit work
- mic/speech permission checks
- notifications
- shell runs that need `needs-screen-recording`

View File

@ -106,14 +106,14 @@ Current state:
The visualizer is intentionally display-only (no clickable overlays needed).
## Screenshots (legacy → Peekaboo takeover)
Clawdis currently has a legacy `screenshot` request returning raw PNG bytes in `Response.payload`.
Clawdis uses `clawdis-mac ui screenshot` and returns a file path (default location: temp directory) instead of raw image bytes.
Migration plan:
- Replace capture implementation with PeekabooAutomationKit's capture service so we share:
- per-screen mapping
- window/app targeting
- visual feedback (flash / watch HUD) when enabled
- Prefer writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata.
- Keep writing images to a file path on the app side and returning the path (text-friendly), with `--json` providing the structured metadata.
- No aliases: remove the old `Request.screenshot` and introduce a new `Request.uiScreenshot` (or similar) so the new behavior is explicit and there's no “legacy mode” to maintain.
## Permissions behavior