Merge origin/main

main
Peter Steinberger 2025-12-14 00:52:40 +00:00
commit 099b8c9fa5
34 changed files with 1862 additions and 81 deletions

View File

@ -13,6 +13,7 @@ final class BridgeConnectionController: ObservableObject {
private weak var appModel: NodeAppModel? private weak var appModel: NodeAppModel?
private var cancellables = Set<AnyCancellable>() private var cancellables = Set<AnyCancellable>()
private var didAutoConnect = false private var didAutoConnect = false
private var seenStableIDs = Set<String>()
init(appModel: NodeAppModel) { init(appModel: NodeAppModel) {
self.appModel = appModel self.appModel = appModel
@ -23,6 +24,7 @@ final class BridgeConnectionController: ObservableObject {
.sink { [weak self] newValue in .sink { [weak self] newValue in
guard let self else { return } guard let self else { return }
self.bridges = newValue self.bridges = newValue
self.updateLastDiscoveredBridge(from: newValue)
self.maybeAutoConnect() self.maybeAutoConnect()
} }
.store(in: &self.cancellables) .store(in: &self.cancellables)
@ -50,9 +52,9 @@ final class BridgeConnectionController: ObservableObject {
guard appModel.bridgeServerName == nil else { return } guard appModel.bridgeServerName == nil else { return }
let defaults = UserDefaults.standard let defaults = UserDefaults.standard
let preferredStableID = defaults.string(forKey: "bridge.preferredStableID")? let targetStableID = defaults.string(forKey: "bridge.lastDiscoveredStableID")?
.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" .trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !preferredStableID.isEmpty else { return } guard !targetStableID.isEmpty else { return }
let instanceId = defaults.string(forKey: "node.instanceId")? let instanceId = defaults.string(forKey: "node.instanceId")?
.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" .trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
@ -64,12 +66,20 @@ final class BridgeConnectionController: ObservableObject {
.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" .trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
guard !token.isEmpty else { return } guard !token.isEmpty else { return }
guard let target = self.bridges.first(where: { $0.stableID == preferredStableID }) else { return } guard let target = self.bridges.first(where: { $0.stableID == targetStableID }) else { return }
self.didAutoConnect = true self.didAutoConnect = true
appModel.connectToBridge(endpoint: target.endpoint, hello: self.makeHello(token: token)) appModel.connectToBridge(endpoint: target.endpoint, hello: self.makeHello(token: token))
} }
/// Records the most recently *newly* discovered bridge so later auto-connects can target it.
private func updateLastDiscoveredBridge(from bridges: [BridgeDiscoveryModel.DiscoveredBridge]) {
    // Side-effecting filter: `insert(...).inserted` is true only the first time a
    // stableID is seen this session, so `newlyDiscovered` excludes re-broadcasts.
    let newlyDiscovered = bridges.filter { self.seenStableIDs.insert($0.stableID).inserted }
    guard let last = newlyDiscovered.last else { return }
    // Persist to both UserDefaults and the Keychain-backed store, mirroring the
    // dual persistence used by BridgeSettingsStore for the other bridge IDs.
    UserDefaults.standard.set(last.stableID, forKey: "bridge.lastDiscoveredStableID")
    BridgeSettingsStore.saveLastDiscoveredBridgeStableID(last.stableID)
}
private func makeHello(token: String) -> BridgeHello { private func makeHello(token: String) -> BridgeHello {
let defaults = UserDefaults.standard let defaults = UserDefaults.standard
let nodeId = defaults.string(forKey: "node.instanceId") ?? "ios-node" let nodeId = defaults.string(forKey: "node.instanceId") ?? "ios-node"

View File

@ -6,13 +6,16 @@ enum BridgeSettingsStore {
private static let instanceIdDefaultsKey = "node.instanceId" private static let instanceIdDefaultsKey = "node.instanceId"
private static let preferredBridgeStableIDDefaultsKey = "bridge.preferredStableID" private static let preferredBridgeStableIDDefaultsKey = "bridge.preferredStableID"
private static let lastDiscoveredBridgeStableIDDefaultsKey = "bridge.lastDiscoveredStableID"
private static let instanceIdAccount = "instanceId" private static let instanceIdAccount = "instanceId"
private static let preferredBridgeStableIDAccount = "preferredStableID" private static let preferredBridgeStableIDAccount = "preferredStableID"
private static let lastDiscoveredBridgeStableIDAccount = "lastDiscoveredStableID"
static func bootstrapPersistence() { static func bootstrapPersistence() {
self.ensureStableInstanceID() self.ensureStableInstanceID()
self.ensurePreferredBridgeStableID() self.ensurePreferredBridgeStableID()
self.ensureLastDiscoveredBridgeStableID()
} }
static func loadStableInstanceID() -> String? { static func loadStableInstanceID() -> String? {
@ -36,6 +39,18 @@ enum BridgeSettingsStore {
account: self.preferredBridgeStableIDAccount) account: self.preferredBridgeStableIDAccount)
} }
/// Loads the last-discovered bridge stable ID from the Keychain, trimmed of whitespace.
/// Returns `nil` when no value is stored; may return an empty string after trimming.
static func loadLastDiscoveredBridgeStableID() -> String? {
    KeychainStore.loadString(service: self.bridgeService, account: self.lastDiscoveredBridgeStableIDAccount)?
        .trimmingCharacters(in: .whitespacesAndNewlines)
}
/// Persists the last-discovered bridge stable ID to the Keychain.
/// - Parameter stableID: The bridge's stable identifier; stored as-is (not trimmed).
static func saveLastDiscoveredBridgeStableID(_ stableID: String) {
    // Save result deliberately discarded — discovery bookkeeping is best-effort.
    _ = KeychainStore.saveString(
        stableID,
        service: self.bridgeService,
        account: self.lastDiscoveredBridgeStableIDAccount)
}
private static func ensureStableInstanceID() { private static func ensureStableInstanceID() {
let defaults = UserDefaults.standard let defaults = UserDefaults.standard
@ -76,4 +91,22 @@ enum BridgeSettingsStore {
defaults.set(stored, forKey: self.preferredBridgeStableIDDefaultsKey) defaults.set(stored, forKey: self.preferredBridgeStableIDDefaultsKey)
} }
} }
/// Keeps the last-discovered bridge ID mirrored between UserDefaults and the Keychain.
/// The defaults value wins when both exist; otherwise the Keychain copy restores defaults.
private static func ensureLastDiscoveredBridgeStableID() {
    let defaults = UserDefaults.standard
    // Defaults has a non-empty value: backfill the Keychain copy if it is missing.
    if let existing = defaults.string(forKey: self.lastDiscoveredBridgeStableIDDefaultsKey)?
        .trimmingCharacters(in: .whitespacesAndNewlines),
        !existing.isEmpty
    {
        if self.loadLastDiscoveredBridgeStableID() == nil {
            self.saveLastDiscoveredBridgeStableID(existing)
        }
        return
    }
    // Defaults empty/absent: restore from the Keychain copy (presumably so the value
    // survives cases where defaults were cleared — TODO confirm intended scenario).
    if let stored = self.loadLastDiscoveredBridgeStableID(), !stored.isEmpty {
        defaults.set(stored, forKey: self.lastDiscoveredBridgeStableIDDefaultsKey)
    }
}
} }

View File

@ -0,0 +1,319 @@
import AVFoundation
import ClawdisKit
import Foundation
import UIKit
/// Captures still photos and short video clips on-device in response to bridge commands.
/// Declared as an actor so capture sessions are created, run, and torn down serially —
/// two concurrent bridge invocations never share an `AVCaptureSession`.
actor CameraController {
    /// Errors surfaced to the bridge; `errorDescription` is the user-visible message.
    enum CameraError: LocalizedError, Sendable {
        case cameraUnavailable
        case microphoneUnavailable
        case permissionDenied(kind: String)
        case invalidParams(String)
        case captureFailed(String)
        case exportFailed(String)

        var errorDescription: String? {
            switch self {
            case .cameraUnavailable:
                "Camera unavailable"
            case .microphoneUnavailable:
                "Microphone unavailable"
            case let .permissionDenied(kind):
                "\(kind) permission denied"
            case let .invalidParams(msg):
                msg
            case let .captureFailed(msg):
                msg
            case let .captureFailed(msg):
                msg
            }
        }
    }

    /// Captures a single still photo.
    /// - Parameter params: Optional facing (defaults to `.front`), max pixel width, and JPEG quality.
    /// - Returns: `"jpg"`, the JPEG bytes as base64, and the final pixel dimensions.
    /// - Throws: `CameraError` when permission is denied or capture/encoding fails.
    func snap(params: ClawdisCameraSnapParams) async throws -> (
        format: String,
        base64: String,
        width: Int,
        height: Int)
    {
        let facing = params.facing ?? .front
        // Non-positive maxWidth values are treated as "no limit" rather than an error.
        let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil }
        let quality = Self.clampQuality(params.quality)

        try await self.ensureAccess(for: .video)

        let session = AVCaptureSession()
        session.sessionPreset = .photo

        guard let device = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let input = try AVCaptureDeviceInput(device: device)
        guard session.canAddInput(input) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(input)

        let output = AVCapturePhotoOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add photo output")
        }
        session.addOutput(output)
        output.maxPhotoQualityPrioritization = .quality

        // NOTE(review): startRunning() is synchronous and will occupy this actor until
        // the session starts — confirm the latency is acceptable for bridge calls.
        session.startRunning()
        defer { session.stopRunning() }

        // Prefer a plain JPEG codec when available so the re-encode below stays cheap.
        let settings: AVCapturePhotoSettings = {
            if output.availablePhotoCodecTypes.contains(.jpeg) {
                return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
            }
            return AVCapturePhotoSettings()
        }()
        settings.photoQualityPrioritization = .quality

        // NOTE(review): relies on AVCapturePhotoOutput keeping the delegate alive for
        // the duration of the capture — confirm against the AVFoundation contract.
        let rawData: Data = try await withCheckedThrowingContinuation { cont in
            output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
        }

        // Re-encode to honor maxWidth/quality regardless of the capture codec.
        let (finalData, size) = try Self.reencodeJPEG(
            imageData: rawData,
            maxWidth: maxWidth,
            quality: quality)
        return (
            format: "jpg",
            base64: finalData.base64EncodedString(),
            width: Int(size.width.rounded()),
            height: Int(size.height.rounded()))
    }

    /// Records a short video clip, optionally with microphone audio, and returns it as base64 MP4.
    /// - Parameter params: Optional facing, duration (clamped by `clampDurationMs`), and audio flag.
    /// - Throws: `CameraError` on permission, capture, or export failure.
    func clip(params: ClawdisCameraClipParams) async throws -> (
        format: String,
        base64: String,
        durationMs: Int,
        hasAudio: Bool)
    {
        let facing = params.facing ?? .front
        let durationMs = Self.clampDurationMs(params.durationMs)
        let includeAudio = params.includeAudio ?? true

        try await self.ensureAccess(for: .video)
        if includeAudio {
            try await self.ensureAccess(for: .audio)
        }

        let session = AVCaptureSession()
        session.sessionPreset = .high
        guard let camera = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let cameraInput = try AVCaptureDeviceInput(device: camera)
        guard session.canAddInput(cameraInput) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(cameraInput)

        if includeAudio {
            guard let mic = AVCaptureDevice.default(for: .audio) else {
                throw CameraError.microphoneUnavailable
            }
            let micInput = try AVCaptureDeviceInput(device: mic)
            if session.canAddInput(micInput) {
                session.addInput(micInput)
            } else {
                throw CameraError.captureFailed("Failed to add microphone input")
            }
        }

        let output = AVCaptureMovieFileOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add movie output")
        }
        session.addOutput(output)
        // The duration limit is the ONLY stop mechanism for this recording.
        // NOTE(review): AVFoundation reports reaching maxRecordedDuration through the
        // delegate's `error` parameter — verify MovieFileDelegate treats that case as
        // success (the macOS CameraCaptureService counterpart does).
        output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
        session.startRunning()
        defer { session.stopRunning() }

        let movURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
        let mp4URL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
        // Remove both temp files on every exit path; `data` below is read before this runs.
        defer {
            try? FileManager.default.removeItem(at: movURL)
            try? FileManager.default.removeItem(at: mp4URL)
        }

        let recordedURL: URL = try await withCheckedThrowingContinuation { cont in
            let delegate = MovieFileDelegate(cont)
            output.startRecording(to: movURL, recordingDelegate: delegate)
        }
        // Transcode .mov -> .mp4 for easier downstream handling.
        try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL)
        let data = try Data(contentsOf: mp4URL)
        return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio)
    }

    /// Ensures capture authorization for the given media type, prompting when undetermined.
    /// - Throws: `CameraError.permissionDenied` when access is denied, restricted, or unknown.
    private func ensureAccess(for mediaType: AVMediaType) async throws {
        let status = AVCaptureDevice.authorizationStatus(for: mediaType)
        switch status {
        case .authorized:
            return
        case .notDetermined:
            // First-time request: shows the system permission prompt.
            let ok = await withCheckedContinuation(isolation: nil) { cont in
                AVCaptureDevice.requestAccess(for: mediaType) { granted in
                    cont.resume(returning: granted)
                }
            }
            if !ok {
                throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
            }
        case .denied, .restricted:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        @unknown default:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        }
    }

    /// Picks the built-in wide-angle camera for the requested facing, or `nil` if absent.
    private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? {
        let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
        return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position)
    }

    /// Clamps JPEG quality to [0.05, 1.0]; defaults to 0.9 when unspecified.
    private nonisolated static func clampQuality(_ quality: Double?) -> Double {
        let q = quality ?? 0.9
        return min(1.0, max(0.05, q))
    }

    /// Clamps the clip duration to [250, 15000] ms; defaults to 3000 ms when unspecified.
    private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
        let v = ms ?? 3000
        // Keep clips short by default; avoid huge base64 payloads on the bridge.
        return min(15000, max(250, v))
    }

    /// Decodes captured bytes, optionally downscales to `maxWidth`, and re-encodes as JPEG.
    /// - Returns: The JPEG data and the final image size in points (UIImage coordinates).
    private nonisolated static func reencodeJPEG(
        imageData: Data,
        maxWidth: Int?,
        quality: Double) throws -> (data: Data, size: CGSize)
    {
        guard let image = UIImage(data: imageData) else {
            throw CameraError.captureFailed("Failed to decode captured image")
        }
        let finalImage: UIImage = if let maxWidth, maxWidth > 0 {
            Self.downscale(image: image, maxWidth: CGFloat(maxWidth))
        } else {
            image
        }
        guard let out = finalImage.jpegData(compressionQuality: quality) else {
            throw CameraError.captureFailed("Failed to encode JPEG")
        }
        return (out, finalImage.size)
    }

    /// Aspect-preserving downscale; returns the original when it is already narrow enough
    /// or has a degenerate (zero) dimension.
    private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage {
        let w = image.size.width
        let h = image.size.height
        guard w > 0, h > 0 else { return image }
        guard w > maxWidth else { return image }
        let scale = maxWidth / w
        let target = CGSize(width: maxWidth, height: max(1, h * scale))
        let format = UIGraphicsImageRendererFormat.default()
        format.opaque = false
        let renderer = UIGraphicsImageRenderer(size: target, format: format)
        return renderer.image { _ in
            image.draw(in: CGRect(origin: .zero, size: target))
        }
    }

    /// Transcodes the recorded QuickTime movie into MP4 at `outputURL`.
    /// - Throws: `CameraError.exportFailed` (or the exporter's own error) on any non-completed status.
    private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
        let asset = AVAsset(url: inputURL)
        guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetHighestQuality) else {
            throw CameraError.exportFailed("Failed to create export session")
        }
        exporter.outputURL = outputURL
        exporter.outputFileType = .mp4
        exporter.shouldOptimizeForNetworkUse = true
        try await withCheckedThrowingContinuation(isolation: nil) { cont in
            exporter.exportAsynchronously {
                switch exporter.status {
                case .completed:
                    cont.resume(returning: ())
                case .failed:
                    cont.resume(throwing: exporter.error ?? CameraError.exportFailed("Export failed"))
                case .cancelled:
                    cont.resume(throwing: CameraError.exportFailed("Export cancelled"))
                default:
                    cont.resume(throwing: CameraError.exportFailed("Export did not complete"))
                }
            }
        }
    }
}
/// Bridges `AVCapturePhotoOutput`'s delegate callback into a checked continuation,
/// guaranteeing the continuation is resumed at most once.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    /// Cleared after the first callback so any repeat invocation is ignored.
    private var pendingContinuation: CheckedContinuation<Data, Error>?

    init(_ continuation: CheckedContinuation<Data, Error>) {
        self.pendingContinuation = continuation
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        guard let pending = self.pendingContinuation else { return }
        self.pendingContinuation = nil

        if let error {
            pending.resume(throwing: error)
        } else if let data = photo.fileDataRepresentation() {
            pending.resume(returning: data)
        } else {
            pending.resume(
                throwing: NSError(domain: "Camera", code: 1, userInfo: [
                    NSLocalizedDescriptionKey: "photo data missing",
                ]))
        }
    }
}
/// Bridges `AVCaptureMovieFileOutput`'s finish callback into a checked continuation,
/// resuming it exactly once.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private let continuation: CheckedContinuation<URL, Error>
    private var didResume = false

    init(_ continuation: CheckedContinuation<URL, Error>) {
        self.continuation = continuation
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true
        if let error {
            // Bug fix: `clip` stops recording solely via `maxRecordedDuration`, and
            // AVFoundation reports reaching that limit as an error even though the
            // recorded file is complete and playable. Treat it as success (this matches
            // the macOS CameraCaptureService delegate); rethrow everything else.
            let ns = error as NSError
            let reachedMaxDuration = ns.domain == AVFoundationErrorDomain
                && ns.code == AVError.maximumDurationReached.rawValue
            if !reachedMaxDuration {
                self.continuation.resume(throwing: error)
                return
            }
        }
        self.continuation.resume(returning: outputFileURL)
    }
}

View File

@ -26,6 +26,8 @@
</array> </array>
<key>NSLocalNetworkUsageDescription</key> <key>NSLocalNetworkUsageDescription</key>
<string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string> <string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested via the bridge.</string>
<key>NSMicrophoneUsageDescription</key> <key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs microphone access for voice wake.</string> <string>Clawdis needs microphone access for voice wake.</string>
<key>NSSpeechRecognitionUsageDescription</key> <key>NSSpeechRecognitionUsageDescription</key>

View File

@ -6,6 +6,7 @@ import SwiftUI
final class NodeAppModel: ObservableObject { final class NodeAppModel: ObservableObject {
@Published var isBackgrounded: Bool = false @Published var isBackgrounded: Bool = false
let screen = ScreenController() let screen = ScreenController()
let camera = CameraController()
@Published var bridgeStatusText: String = "Not connected" @Published var bridgeStatusText: String = "Not connected"
@Published var bridgeServerName: String? @Published var bridgeServerName: String?
@Published var bridgeRemoteAddress: String? @Published var bridgeRemoteAddress: String?
@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject {
} }
private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse { private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse {
if req.command.hasPrefix("screen."), self.isBackgrounded { if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded {
return BridgeInvokeResponse( return BridgeInvokeResponse(
id: req.id, id: req.id,
ok: false, ok: false,
error: ClawdisNodeError( error: ClawdisNodeError(
code: .backgroundUnavailable, code: .backgroundUnavailable,
message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground")) message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground"))
}
if req.command.hasPrefix("camera."), !self.isCameraEnabled() {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera"))
} }
do { do {
@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject {
let payload = try Self.encodePayload(["format": "png", "base64": base64]) let payload = try Self.encodePayload(["format": "png", "base64": base64])
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload) return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.snap.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ??
ClawdisCameraSnapParams()
let res = try await self.camera.snap(params: params)
struct Payload: Codable {
var format: String
var base64: String
var width: Int
var height: Int
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
width: res.width,
height: res.height))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.clip.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ??
ClawdisCameraClipParams()
let suspended = (params.includeAudio ?? true) ? self.voiceWake.suspendForExternalAudioCapture() : false
defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) }
let res = try await self.camera.clip(params: params)
struct Payload: Codable {
var format: String
var base64: String
var durationMs: Int
var hasAudio: Bool
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
durationMs: res.durationMs,
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default: default:
return BridgeInvokeResponse( return BridgeInvokeResponse(
id: req.id, id: req.id,
@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject {
} }
return json return json
} }
/// Whether camera bridge commands are allowed by the user preference.
/// A missing "camera.enabled" key is treated as enabled (default-on).
private func isCameraEnabled() -> Bool {
    let defaults = UserDefaults.standard
    guard defaults.object(forKey: "camera.enabled") != nil else { return true }
    return defaults.bool(forKey: "camera.enabled")
}
} }

View File

@ -2,6 +2,7 @@ import SwiftUI
struct RootTabs: View { struct RootTabs: View {
@EnvironmentObject private var appModel: NodeAppModel @EnvironmentObject private var appModel: NodeAppModel
@State private var isConnectingPulse: Bool = false
var body: some View { var body: some View {
TabView { TabView {
@ -27,12 +28,18 @@ struct RootTabs: View {
radius: self.settingsIndicatorGlowRadius, radius: self.settingsIndicatorGlowRadius,
x: 0, x: 0,
y: 0) y: 0)
.scaleEffect(self.settingsIndicatorScale)
.opacity(self.settingsIndicatorOpacity)
.offset(x: 7, y: -2) .offset(x: 7, y: -2)
} }
Text("Settings") Text("Settings")
} }
} }
} }
.onAppear { self.updateConnectingPulse(for: self.bridgeIndicatorState) }
.onChange(of: self.bridgeIndicatorState) { _, newValue in
self.updateConnectingPulse(for: newValue)
}
} }
private enum BridgeIndicatorState { private enum BridgeIndicatorState {
@ -74,9 +81,31 @@ struct RootTabs: View {
case .connected: case .connected:
6 6
case .connecting: case .connecting:
4 self.isConnectingPulse ? 6 : 3
case .disconnected: case .disconnected:
0 0
} }
} }
/// Scale for the Settings-tab indicator dot; pulses between 0.96 and 1.12
/// only while the bridge is connecting, otherwise stays at 1.
private var settingsIndicatorScale: CGFloat {
    guard self.bridgeIndicatorState == .connecting else { return 1 }
    return self.isConnectingPulse ? 1.12 : 0.96
}
/// Opacity for the Settings-tab indicator dot; dims to 0.75 on the low phase of the
/// connecting pulse, and is fully opaque in every other state.
private var settingsIndicatorOpacity: Double {
    guard self.bridgeIndicatorState == .connecting else { return 1 }
    return self.isConnectingPulse ? 1.0 : 0.75
}
/// Starts or stops the indicator's pulse animation when the bridge state changes.
private func updateConnectingPulse(for state: BridgeIndicatorState) {
    guard state == .connecting else {
        // Left the connecting state: settle back with a short ease-out.
        withAnimation(.easeOut(duration: 0.2)) { self.isConnectingPulse = false }
        return
    }
    // Already pulsing — don't stack a second repeatForever animation.
    guard !self.isConnectingPulse else { return }
    withAnimation(.easeInOut(duration: 0.9).repeatForever(autoreverses: true)) {
        self.isConnectingPulse = true
    }
}
} }

View File

@ -19,6 +19,7 @@ struct SettingsTab: View {
@AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false
@AppStorage("camera.enabled") private var cameraEnabled: Bool = true @AppStorage("camera.enabled") private var cameraEnabled: Bool = true
@AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = "" @AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = ""
@AppStorage("bridge.lastDiscoveredStableID") private var lastDiscoveredBridgeStableID: String = ""
@StateObject private var connectStatus = ConnectStatusStore() @StateObject private var connectStatus = ConnectStatusStore()
@State private var connectingBridgeID: String? @State private var connectingBridgeID: String?
@State private var localIPAddress: String? @State private var localIPAddress: String?
@ -207,6 +208,8 @@ struct SettingsTab: View {
self.connectingBridgeID = bridge.id self.connectingBridgeID = bridge.id
self.preferredBridgeStableID = bridge.stableID self.preferredBridgeStableID = bridge.stableID
BridgeSettingsStore.savePreferredBridgeStableID(bridge.stableID) BridgeSettingsStore.savePreferredBridgeStableID(bridge.stableID)
self.lastDiscoveredBridgeStableID = bridge.stableID
BridgeSettingsStore.saveLastDiscoveredBridgeStableID(bridge.stableID)
defer { self.connectingBridgeID = nil } defer { self.connectingBridgeID = nil }
do { do {

View File

@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject {
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation) try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
} }
/// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio.
/// Returns `true` when listening was active and was suspended.
/// Pair with `resumeAfterExternalAudioCapture(wasSuspended:)` once the external capture finishes.
func suspendForExternalAudioCapture() -> Bool {
    guard self.isEnabled, self.isListening else { return false }
    self.isListening = false
    self.statusText = "Paused"
    // Tear down in order: drain task, tap queue, recognition, then the audio engine/tap.
    self.tapDrainTask?.cancel()
    self.tapDrainTask = nil
    self.tapQueue?.clear()
    self.tapQueue = nil
    self.recognitionTask?.cancel()
    self.recognitionTask = nil
    self.recognitionRequest = nil
    if self.audioEngine.isRunning {
        self.audioEngine.stop()
        self.audioEngine.inputNode.removeTap(onBus: 0)
    }
    // Deactivate the shared audio session (best effort) so the external capture can claim the mic.
    try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
    return true
}
/// Restarts voice-wake listening after an external audio capture, but only when the
/// paired suspend call actually suspended it (`wasSuspended == true`).
func resumeAfterExternalAudioCapture(wasSuspended: Bool) {
    guard wasSuspended else { return }
    Task { await self.start() }
}
private func startRecognition() throws { private func startRecognition() throws {
self.recognitionTask?.cancel() self.recognitionTask?.cancel()
self.recognitionTask = nil self.recognitionTask = nil

View File

@ -54,5 +54,6 @@ targets:
NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network. NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network.
NSBonjourServices: NSBonjourServices:
- _clawdis-bridge._tcp - _clawdis-bridge._tcp
NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge.
NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake. NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake.
NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake. NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake.

View File

@ -0,0 +1,341 @@
import AVFoundation
import ClawdisIPC
import CoreGraphics
import Foundation
import ImageIO
import OSLog
import UniformTypeIdentifiers
actor CameraCaptureService {
enum CameraError: LocalizedError, Sendable {
case cameraUnavailable
case microphoneUnavailable
case permissionDenied(kind: String)
case captureFailed(String)
case exportFailed(String)
var errorDescription: String? {
switch self {
case .cameraUnavailable:
"Camera unavailable"
case .microphoneUnavailable:
"Microphone unavailable"
case let .permissionDenied(kind):
"\(kind) permission denied"
case let .captureFailed(msg):
msg
case let .exportFailed(msg):
msg
}
}
}
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera")
func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) {
let facing = facing ?? .front
let maxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil }
let quality = Self.clampQuality(quality)
try await self.ensureAccess(for: .video)
let session = AVCaptureSession()
session.sessionPreset = .photo
guard let device = Self.pickCamera(facing: facing) else {
throw CameraError.cameraUnavailable
}
let input = try AVCaptureDeviceInput(device: device)
guard session.canAddInput(input) else {
throw CameraError.captureFailed("Failed to add camera input")
}
session.addInput(input)
let output = AVCapturePhotoOutput()
guard session.canAddOutput(output) else {
throw CameraError.captureFailed("Failed to add photo output")
}
session.addOutput(output)
output.maxPhotoQualityPrioritization = .quality
session.startRunning()
defer { session.stopRunning() }
let settings: AVCapturePhotoSettings = {
if output.availablePhotoCodecTypes.contains(.jpeg) {
return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
}
return AVCapturePhotoSettings()
}()
settings.photoQualityPrioritization = .quality
let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in
output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
}
return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality)
}
func clip(
facing: CameraFacing?,
durationMs: Int?,
includeAudio: Bool,
outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool)
{
let facing = facing ?? .front
let durationMs = Self.clampDurationMs(durationMs)
try await self.ensureAccess(for: .video)
if includeAudio {
try await self.ensureAccess(for: .audio)
}
let session = AVCaptureSession()
session.sessionPreset = .high
guard let camera = Self.pickCamera(facing: facing) else {
throw CameraError.cameraUnavailable
}
let cameraInput = try AVCaptureDeviceInput(device: camera)
guard session.canAddInput(cameraInput) else {
throw CameraError.captureFailed("Failed to add camera input")
}
session.addInput(cameraInput)
if includeAudio {
guard let mic = AVCaptureDevice.default(for: .audio) else {
throw CameraError.microphoneUnavailable
}
let micInput = try AVCaptureDeviceInput(device: mic)
guard session.canAddInput(micInput) else {
throw CameraError.captureFailed("Failed to add microphone input")
}
session.addInput(micInput)
}
let output = AVCaptureMovieFileOutput()
guard session.canAddOutput(output) else {
throw CameraError.captureFailed("Failed to add movie output")
}
session.addOutput(output)
output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
session.startRunning()
defer { session.stopRunning() }
let tmpMovURL = FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
defer { try? FileManager.default.removeItem(at: tmpMovURL) }
let outputURL: URL = {
if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
return URL(fileURLWithPath: outPath)
}
return FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
}()
// Ensure we don't fail exporting due to an existing file.
try? FileManager.default.removeItem(at: outputURL)
let logger = self.logger
let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in
output.startRecording(to: tmpMovURL, recordingDelegate: MovieFileDelegate(cont, logger: logger))
}
try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL)
return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio)
}
private func ensureAccess(for mediaType: AVMediaType) async throws {
let status = AVCaptureDevice.authorizationStatus(for: mediaType)
switch status {
case .authorized:
return
case .notDetermined:
let ok = await withCheckedContinuation(isolation: nil) { cont in
AVCaptureDevice.requestAccess(for: mediaType) { granted in
cont.resume(returning: granted)
}
}
if !ok {
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
}
case .denied, .restricted:
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
@unknown default:
throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
}
}
private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? {
let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
return device
}
// Many macOS cameras report `unspecified` position; fall back to any default.
return AVCaptureDevice.default(for: .video)
}
private nonisolated static func clampQuality(_ quality: Double?) -> Double {
let q = quality ?? 0.9
return min(1.0, max(0.05, q))
}
private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
let v = ms ?? 3000
return min(15_000, max(250, v))
}
private nonisolated static func reencodeJPEG(
imageData: Data,
maxWidth: Int?,
quality: Double) throws -> (data: Data, size: CGSize)
{
guard let src = CGImageSourceCreateWithData(imageData as CFData, nil),
let img = CGImageSourceCreateImageAtIndex(src, 0, nil)
else {
throw CameraError.captureFailed("Failed to decode captured image")
}
let finalImage: CGImage
if let maxWidth, img.width > maxWidth {
guard let scaled = self.downscale(image: img, maxWidth: maxWidth) else {
throw CameraError.captureFailed("Failed to downscale image")
}
finalImage = scaled
} else {
finalImage = img
}
let out = NSMutableData()
guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else {
throw CameraError.captureFailed("Failed to create JPEG destination")
}
let props = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary
CGImageDestinationAddImage(dest, finalImage, props)
guard CGImageDestinationFinalize(dest) else {
throw CameraError.captureFailed("Failed to encode JPEG")
}
return (out as Data, CGSize(width: finalImage.width, height: finalImage.height))
}
private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? {
guard image.width > 0, image.height > 0 else { return image }
guard image.width > maxWidth else { return image }
let scale = Double(maxWidth) / Double(image.width)
let targetW = maxWidth
let targetH = max(1, Int((Double(image.height) * scale).rounded()))
let cs = CGColorSpaceCreateDeviceRGB()
let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue
guard let ctx = CGContext(
data: nil,
width: targetW,
height: targetH,
bitsPerComponent: 8,
bytesPerRow: 0,
space: cs,
bitmapInfo: bitmapInfo)
else { return nil }
ctx.interpolationQuality = .high
ctx.draw(image, in: CGRect(x: 0, y: 0, width: targetW, height: targetH))
return ctx.makeImage()
}
/// Transcodes the movie at `inputURL` into an MP4 at `outputURL` using the
/// medium-quality preset, optimized for network use.
///
/// - Throws: `CameraError.exportFailed` when the session cannot be created,
///   or the export fails, is cancelled, or ends in any non-completed state.
private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
    let asset = AVAsset(url: inputURL)
    guard let session = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else {
        throw CameraError.exportFailed("Failed to create export session")
    }
    session.outputURL = outputURL
    session.outputFileType = .mp4
    session.shouldOptimizeForNetworkUse = true

    // Bridge the callback-based export API into async/await; the status is
    // inspected after the callback fires.
    await withCheckedContinuation { continuation in
        session.exportAsynchronously { continuation.resume() }
    }

    switch session.status {
    case .completed:
        return
    case .failed:
        throw CameraError.exportFailed(session.error?.localizedDescription ?? "export failed")
    case .cancelled:
        throw CameraError.exportFailed("export cancelled")
    default:
        throw CameraError.exportFailed("export did not complete (\(session.status.rawValue))")
    }
}
}
/// Bridges `AVCapturePhotoOutput`'s delegate callback into a checked
/// continuation. The continuation is taken on first use so it can never be
/// resumed twice, even if the callback fires again.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    private var continuation: CheckedContinuation<Data, Error>?

    init(_ cont: CheckedContinuation<Data, Error>) {
        self.continuation = cont
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        // Take the continuation up front; ignore any later callbacks.
        guard let continuation = self.continuation else { return }
        self.continuation = nil

        if let error {
            continuation.resume(throwing: error)
        } else if let data = photo.fileDataRepresentation() {
            continuation.resume(returning: data)
        } else {
            continuation.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data"))
        }
    }
}
/// Bridges `AVCaptureMovieFileOutput`'s recording-finished callback into a
/// checked continuation, resuming it exactly once with the recorded file URL
/// (or the recording error).
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private var continuation: CheckedContinuation<URL, Error>?
    private let logger: Logger

    init(_ cont: CheckedContinuation<URL, Error>, logger: Logger) {
        self.continuation = cont
        self.logger = logger
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        // Take the continuation up front so it can never be resumed twice.
        guard let continuation = self.continuation else { return }
        self.continuation = nil

        guard let error else {
            continuation.resume(returning: outputFileURL)
            return
        }

        // Reaching the configured maximum duration still yields a usable file,
        // so AVFoundation's "maximum duration reached" error counts as success.
        let nsError = error as NSError
        if nsError.domain == AVFoundationErrorDomain,
           nsError.code == AVError.maximumDurationReached.rawValue
        {
            continuation.resume(returning: outputFileURL)
            return
        }

        self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)")
        continuation.resume(throwing: error)
    }
}

View File

@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled"
let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled" let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled"
let webChatPortKey = "clawdis.webChatPort" let webChatPortKey = "clawdis.webChatPort"
let canvasEnabledKey = "clawdis.canvasEnabled" let canvasEnabledKey = "clawdis.canvasEnabled"
let cameraEnabledKey = "clawdis.cameraEnabled"
let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled" let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled"
let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled" let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled"
let deepLinkKeyKey = "clawdis.deepLinkKey" let deepLinkKeyKey = "clawdis.deepLinkKey"

View File

@ -3,6 +3,8 @@ import Foundation
import OSLog import OSLog
enum ControlRequestHandler { enum ControlRequestHandler {
private static let cameraCapture = CameraCaptureService()
static func process( static func process(
request: Request, request: Request,
notifier: NotificationManager = NotificationManager(), notifier: NotificationManager = NotificationManager(),
@ -77,6 +79,16 @@ enum ControlRequestHandler {
command: command, command: command,
paramsJSON: paramsJSON, paramsJSON: paramsJSON,
logger: logger) logger: logger)
case let .cameraSnap(facing, maxWidth, quality, outPath):
return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
return await self.handleCameraClip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
} }
} }
@ -173,6 +185,10 @@ enum ControlRequestHandler {
UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
} }
private static func cameraEnabled() -> Bool {
UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool ?? false
}
private static func handleCanvasShow( private static func handleCanvasShow(
session: String, session: String,
path: String?, path: String?,
@ -254,4 +270,46 @@ enum ControlRequestHandler {
return Response(ok: false, message: error.localizedDescription) return Response(ok: false, message: error.localizedDescription)
} }
} }
private static func handleCameraSnap(
facing: CameraFacing?,
maxWidth: Int?,
quality: Double?,
outPath: String?) async -> Response
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality)
let url: URL = if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
URL(fileURLWithPath: outPath)
} else {
FileManager.default.temporaryDirectory
.appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg")
}
try res.data.write(to: url, options: [.atomic])
return Response(ok: true, message: url.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
private static func handleCameraClip(
facing: CameraFacing?,
durationMs: Int?,
includeAudio: Bool,
outPath: String?) async -> Response
{
guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
do {
let res = try await self.cameraCapture.clip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
return Response(ok: true, message: res.path)
} catch {
return Response(ok: false, message: error.localizedDescription)
}
}
} }

View File

@ -9,6 +9,7 @@ struct DebugSettings: View {
@AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0 @AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0
@AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue @AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue
@AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true @AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true
@AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false
@AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false @AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false
@State private var modelsCount: Int? @State private var modelsCount: Int?
@State private var modelsLoading = false @State private var modelsLoading = false
@ -48,6 +49,7 @@ struct DebugSettings: View {
self.pathsSection self.pathsSection
self.quickActionsSection self.quickActionsSection
self.canvasSection self.canvasSection
self.cameraSection
self.experimentsSection self.experimentsSection
Spacer(minLength: 0) Spacer(minLength: 0)
@ -571,6 +573,20 @@ struct DebugSettings: View {
} }
} }
private var cameraSection: some View {
GroupBox("Camera") {
VStack(alignment: .leading, spacing: 10) {
Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled)
.toggleStyle(.checkbox)
.help("When off, camera requests return “Camera disabled by user”.")
Text("Allows Clawdis to capture a photo or short video via the built-in camera.")
.font(.caption)
.foregroundStyle(.secondary)
}
}
}
private var experimentsSection: some View { private var experimentsSection: some View {
GroupBox("Experiments") { GroupBox("Experiments") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) { Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {

View File

@ -52,6 +52,7 @@ struct ClawdisCLI {
enum Kind { enum Kind {
case generic case generic
case mediaPath
} }
} }
@ -91,6 +92,9 @@ struct ClawdisCLI {
case "canvas": case "canvas":
return try self.parseCanvas(args: &args) return try self.parseCanvas(args: &args)
case "camera":
return try self.parseCamera(args: &args)
default: default:
throw CLIError.help throw CLIError.help
} }
@ -292,6 +296,62 @@ struct ClawdisCLI {
} }
} }
private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest {
guard let sub = args.popFirst() else { throw CLIError.help }
switch sub {
case "snap":
var facing: CameraFacing?
var maxWidth: Int?
var quality: Double?
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--facing":
if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
case "--max-width":
maxWidth = args.popFirst().flatMap(Int.init)
case "--quality":
quality = args.popFirst().flatMap(Double.init)
case "--out":
outPath = args.popFirst()
default:
break
}
}
return ParsedCLIRequest(
request: .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath),
kind: .mediaPath)
case "clip":
var facing: CameraFacing?
var durationMs: Int?
var includeAudio = true
var outPath: String?
while !args.isEmpty {
let arg = args.removeFirst()
switch arg {
case "--facing":
if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
case "--duration-ms":
durationMs = args.popFirst().flatMap(Int.init)
case "--no-audio":
includeAudio = false
case "--out":
outPath = args.popFirst()
default:
break
}
}
return ParsedCLIRequest(
request: .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath),
kind: .mediaPath)
default:
throw CLIError.help
}
}
private static func parseCanvasPlacement( private static func parseCanvasPlacement(
args: inout [String], args: inout [String],
session: inout String, session: inout String,
@ -334,6 +394,10 @@ struct ClawdisCLI {
if let message = response.message, !message.isEmpty { if let message = response.message, !message.isEmpty {
FileHandle.standardOutput.write(Data((message + "\n").utf8)) FileHandle.standardOutput.write(Data((message + "\n").utf8))
} }
case .mediaPath:
if let message = response.message, !message.isEmpty {
print("MEDIA:\(message)")
}
} }
} }
@ -352,6 +416,8 @@ struct ClawdisCLI {
output["payload"] = text output["payload"] = text
} }
} }
case .mediaPath:
break
} }
let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted]) let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted])
@ -406,6 +472,10 @@ struct ClawdisCLI {
clawdis-mac canvas eval --js <code> [--session <key>] clawdis-mac canvas eval --js <code> [--session <key>]
clawdis-mac canvas snapshot [--out <path>] [--session <key>] clawdis-mac canvas snapshot [--out <path>] [--session <key>]
Camera:
clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>]
clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>]
Browser (clawd): Browser (clawd):
clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@ -433,6 +503,7 @@ struct ClawdisCLI {
Output: Output:
Default output is text. Use --json for machine-readable output. Default output is text. Use --json for machine-readable output.
In text mode, `browser screenshot` prints MEDIA:<path>. In text mode, `browser screenshot` prints MEDIA:<path>.
In text mode, `camera snap` and `camera clip` print MEDIA:<path>.
""" """
print(usage) print(usage)
} }

View File

@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable {
case speechRecognition case speechRecognition
} }
public enum CameraFacing: String, Codable, Sendable {
case front
case back
}
// MARK: - Requests // MARK: - Requests
/// Notification interruption level (maps to UNNotificationInterruptionLevel) /// Notification interruption level (maps to UNNotificationInterruptionLevel)
@ -74,6 +79,8 @@ public enum Request: Sendable {
case canvasSnapshot(session: String, outPath: String?) case canvasSnapshot(session: String, outPath: String?)
case nodeList case nodeList
case nodeInvoke(nodeId: String, command: String, paramsJSON: String?) case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
} }
// MARK: - Responses // MARK: - Responses
@ -104,6 +111,11 @@ extension Request: Codable {
case path case path
case javaScript case javaScript
case outPath case outPath
case facing
case maxWidth
case quality
case durationMs
case includeAudio
case placement case placement
case nodeId case nodeId
case nodeCommand case nodeCommand
@ -124,6 +136,8 @@ extension Request: Codable {
case canvasSnapshot case canvasSnapshot
case nodeList case nodeList
case nodeInvoke case nodeInvoke
case cameraSnap
case cameraClip
} }
public func encode(to encoder: Encoder) throws { public func encode(to encoder: Encoder) throws {
@ -198,6 +212,20 @@ extension Request: Codable {
try container.encode(nodeId, forKey: .nodeId) try container.encode(nodeId, forKey: .nodeId)
try container.encode(command, forKey: .nodeCommand) try container.encode(command, forKey: .nodeCommand)
try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON) try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON)
case let .cameraSnap(facing, maxWidth, quality, outPath):
try container.encode(Kind.cameraSnap, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(maxWidth, forKey: .maxWidth)
try container.encodeIfPresent(quality, forKey: .quality)
try container.encodeIfPresent(outPath, forKey: .outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
try container.encode(Kind.cameraClip, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(durationMs, forKey: .durationMs)
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
} }
} }
@ -274,6 +302,20 @@ extension Request: Codable {
let command = try container.decode(String.self, forKey: .nodeCommand) let command = try container.decode(String.self, forKey: .nodeCommand)
let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON) let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON)
self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON) self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON)
case .cameraSnap:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth)
let quality = try container.decodeIfPresent(Double.self, forKey: .quality)
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case .cameraClip:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath)
} }
} }
} }

View File

@ -0,0 +1,62 @@
import ClawdisIPC
import Foundation
import Testing
@Suite struct CameraIPCTests {
    /// Encoding then decoding a `cameraSnap` request must preserve every field.
    @Test func cameraSnapCodableRoundtrip() throws {
        let request: Request = .cameraSnap(
            facing: .front,
            maxWidth: 640,
            quality: 0.85,
            outPath: "/tmp/test.jpg")
        let roundtripped = try JSONDecoder().decode(Request.self, from: JSONEncoder().encode(request))
        guard case let .cameraSnap(facing, maxWidth, quality, outPath) = roundtripped else {
            Issue.record("expected cameraSnap, got \(roundtripped)")
            return
        }
        #expect(facing == .front)
        #expect(maxWidth == 640)
        #expect(quality == 0.85)
        #expect(outPath == "/tmp/test.jpg")
    }

    /// Encoding then decoding a `cameraClip` request must preserve every field.
    @Test func cameraClipCodableRoundtrip() throws {
        let request: Request = .cameraClip(
            facing: .back,
            durationMs: 3000,
            includeAudio: false,
            outPath: "/tmp/test.mp4")
        let roundtripped = try JSONDecoder().decode(Request.self, from: JSONEncoder().encode(request))
        guard case let .cameraClip(facing, durationMs, includeAudio, outPath) = roundtripped else {
            Issue.record("expected cameraClip, got \(roundtripped)")
            return
        }
        #expect(facing == .back)
        #expect(durationMs == 3000)
        #expect(includeAudio == false)
        #expect(outPath == "/tmp/test.mp4")
    }

    /// A wire payload missing `includeAudio` must decode with audio enabled.
    @Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws {
        let payload = Data("""
        {"type":"cameraClip","durationMs":1234}
        """.utf8)
        let decoded = try JSONDecoder().decode(Request.self, from: payload)
        guard case let .cameraClip(_, durationMs, includeAudio, _) = decoded else {
            Issue.record("expected cameraClip, got \(decoded)")
            return
        }
        #expect(durationMs == 1234)
        #expect(includeAudio == true)
    }
}

View File

@ -0,0 +1,58 @@
import Foundation
/// Camera commands addressable on a node; raw values are the wire command names.
public enum ClawdisCameraCommand: String, Codable, Sendable {
    case snap = "camera.snap"
    case clip = "camera.clip"
}
/// Which physical camera to use for a capture.
public enum ClawdisCameraFacing: String, Codable, Sendable {
    case back
    case front
}
/// Accepted still-image format identifiers for camera snaps.
public enum ClawdisCameraImageFormat: String, Codable, Sendable {
    case jpg
    case jpeg
}
/// Accepted video container formats for camera clips.
public enum ClawdisCameraVideoFormat: String, Codable, Sendable {
    case mp4
}
/// Parameters for a `camera.snap` photo capture.
/// Every field is optional; the capturing side applies its own defaults.
public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable {
    /// Which camera to use; `nil` lets the receiver pick a default.
    public var facing: ClawdisCameraFacing?
    /// Maximum output width in pixels; `nil` requests no width cap.
    public var maxWidth: Int?
    /// Lossy compression quality; `nil` uses the receiver's default.
    public var quality: Double?
    /// Desired image format; `nil` uses the receiver's default.
    public var format: ClawdisCameraImageFormat?
    /// Memberwise initializer; every parameter defaults to `nil`.
    public init(
        facing: ClawdisCameraFacing? = nil,
        maxWidth: Int? = nil,
        quality: Double? = nil,
        format: ClawdisCameraImageFormat? = nil)
    {
        self.facing = facing
        self.maxWidth = maxWidth
        self.quality = quality
        self.format = format
    }
}
/// Parameters for a `camera.clip` short-video capture.
/// Every field is optional; the capturing side applies its own defaults.
public struct ClawdisCameraClipParams: Codable, Sendable, Equatable {
    /// Which camera to use; `nil` lets the receiver pick a default.
    public var facing: ClawdisCameraFacing?
    /// Clip length in milliseconds; `nil` uses the receiver's default.
    public var durationMs: Int?
    /// Whether to record audio alongside video; `nil` uses the receiver's default.
    public var includeAudio: Bool?
    /// Desired video container format; `nil` uses the receiver's default.
    public var format: ClawdisCameraVideoFormat?
    /// Memberwise initializer; every parameter defaults to `nil`.
    public init(
        facing: ClawdisCameraFacing? = nil,
        durationMs: Int? = nil,
        includeAudio: Bool? = nil,
        format: ClawdisCameraVideoFormat? = nil)
    {
        self.facing = facing
        self.durationMs = durationMs
        self.includeAudio = includeAudio
        self.format = format
    }
}

View File

@ -9,7 +9,7 @@ read_when:
## What Clawdis Does ## What Clawdis Does
- Runs WhatsApp gateway + Pi coding agent so the assistant can read/write chats, fetch context, and run tools via the host Mac. - Runs WhatsApp gateway + Pi coding agent so the assistant can read/write chats, fetch context, and run tools via the host Mac.
- macOS app manages permissions (screen recording, notifications, microphone) and exposes a CLI helper `clawdis-mac` for scripts. - macOS app manages permissions (screen recording, notifications, microphone) and exposes a CLI helper `clawdis-mac` for scripts.
- Sessions are per-sender; heartbeats keep background tasks alive. - Direct chats collapse into the shared `main` session by default; groups stay isolated as `group:<jid>`; heartbeats keep background tasks alive.
## Core Tools (enable in Settings → Tools) ## Core Tools (enable in Settings → Tools)
- **mcporter** — MCP runtime/CLI to list, call, and sync Model Context Protocol servers. - **mcporter** — MCP runtime/CLI to list, call, and sync Model Context Protocol servers.

View File

@ -122,8 +122,8 @@
<span class="footer__sep">·</span> <span class="footer__sep">·</span>
<a href="https://github.com/steipete/clawdis">source</a> <a href="https://github.com/steipete/clawdis">source</a>
<span class="footer__sep">·</span> <span class="footer__sep">·</span>
<a href="https://www.npmjs.com/package/clawdis">npm</a> <a href="https://github.com/steipete/clawdis/releases">releases</a>
</div> </div>
<div class="footer__hint" aria-hidden="true"> <div class="footer__hint" aria-hidden="true">
tip: press <kbd>F2</kbd> (Mac: <kbd>fn</kbd>+<kbd>F2</kbd>) to flip tip: press <kbd>F2</kbd> (Mac: <kbd>fn</kbd>+<kbd>F2</kbd>) to flip
the universe the universe

98
docs/camera.md Normal file
View File

@ -0,0 +1,98 @@
---
summary: "Camera capture (iOS node + macOS app) for agent use: photos (jpg) and short video clips (mp4)"
read_when:
- Adding or modifying camera capture on iOS nodes or macOS
- Extending agent-accessible MEDIA temp-file workflows
---
# Camera capture (agent)
Clawdis supports **camera capture** for agent workflows:
- **iOS node** (paired via Gateway): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `node.invoke`.
- **macOS app** (local control socket): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `clawdis-mac`.
All camera access is gated behind **user-controlled settings**.
## iOS node
### User setting (default on)
- iOS Settings tab → **Camera** → **Allow Camera** (`camera.enabled`)
- Default: **on** (missing key is treated as enabled).
- When off: `camera.*` commands return `CAMERA_DISABLED`.
### Commands (via Gateway `node.invoke`)
- `camera.snap`
- Params:
- `facing`: `front|back` (default: `front`)
- `maxWidth`: number (optional)
- `quality`: `0..1` (optional; default `0.9`)
- `format`: currently `jpg`
- Response payload:
- `format: "jpg"`
- `base64: "<...>"`
- `width`, `height`
- `camera.clip`
- Params:
- `facing`: `front|back` (default: `front`)
- `durationMs`: number (default `3000`, clamped to a max)
- `includeAudio`: boolean (default `true`)
- `format`: currently `mp4`
- Response payload:
- `format: "mp4"`
- `base64: "<...>"`
- `durationMs`
- `hasAudio`
### Foreground requirement
Like `screen.*`, the iOS node only allows `camera.*` commands in the **foreground**. Background invocations return `NODE_BACKGROUND_UNAVAILABLE`.
### CLI helper (temp files + MEDIA)
The easiest way to get attachments is via the CLI helper, which writes decoded media to a temp file and prints `MEDIA:<path>`.
Examples:
```bash
clawdis nodes camera snap --node <id> # default: both front + back (2 MEDIA lines)
clawdis nodes camera snap --node <id> --facing front
clawdis nodes camera clip --node <id> --duration 3000
clawdis nodes camera clip --node <id> --no-audio
```
Notes:
- `nodes camera snap` defaults to **both** facings to give the agent both views.
- Output files are temporary (in the OS temp directory) unless you build your own wrapper.
## macOS app
### User setting (default off)
The macOS companion app exposes a checkbox:
- **Settings → Debug → Camera → Allow Camera (agent)** (`clawdis.cameraEnabled`)
- Default: **off**
- When off: camera requests return “Camera disabled by user”.
### CLI helper (local control socket)
The `clawdis-mac` helper talks to the running menu bar app over the local control socket.
Examples:
```bash
clawdis-mac camera snap # prints MEDIA:<path>
clawdis-mac camera snap --max-width 1280
clawdis-mac camera clip --duration-ms 3000 # prints MEDIA:<path>
clawdis-mac camera clip --no-audio
```
## Safety + practical limits
- Camera and microphone access trigger the usual OS permission prompts (and require usage strings in Info.plist).
- Video clips are intentionally short to avoid oversized bridge payloads (base64 overhead + WebSocket message limits).

View File

@ -24,9 +24,17 @@ Start conservative:
## Prerequisites ## Prerequisites
- Node **22+** - Node **22+**
- CLAWDIS installed: `npm install -g clawdis` - CLAWDIS available on PATH (recommended during development: from source + global link)
- A second phone number (SIM/eSIM/prepaid) for the assistant - A second phone number (SIM/eSIM/prepaid) for the assistant
From source (recommended while the npm package is still settling):
```bash
pnpm install
pnpm build
pnpm link --global
```
## The two-phone setup (recommended) ## The two-phone setup (recommended)
You want this: You want this:
@ -121,7 +129,7 @@ Example:
## Sessions and memory ## Sessions and memory
- Session files: `~/.clawdis/sessions/{{SessionId}}.jsonl` - Session files: `~/.clawdis/sessions/{{SessionId}}.jsonl`
- Session metadata (token usage, last route, etc): `~/.clawdis/sessions.json` - Session metadata (token usage, last route, etc): `~/.clawdis/sessions/sessions.json` (legacy: `~/.clawdis/sessions.json`)
- `/new` starts a fresh session for that chat (configurable via `resetTriggers`) - `/new` starts a fresh session for that chat (configurable via `resetTriggers`)
## Heartbeats (proactive mode) ## Heartbeats (proactive mode)

View File

@ -5,9 +5,10 @@ read_when:
--- ---
# Control channel API (newline-delimited JSON) # Control channel API (newline-delimited JSON)
**Deprecated:** superseded by the WebSocket Gateway protocol (`clawdis gateway`, see `docs/architecture.md` and `docs/gateway.md`). Use only for legacy builds predating the Gateway rollout. **Deprecated (historical):** superseded by the WebSocket Gateway protocol (`clawdis gateway`, see `docs/architecture.md` and `docs/gateway.md`).
Current builds use a WebSocket server on `ws://127.0.0.1:18789` and do **not** expose this TCP control channel.
Endpoint: `127.0.0.1:18789` (TCP, localhost only). Clients reach it via SSH port forward in remote mode. Legacy endpoint (if present in an older build): `127.0.0.1:18789` (TCP, localhost only), typically reached via SSH port forward in remote mode.
## Frame format ## Frame format
Each line is a JSON object. Two shapes exist: Each line is a JSON object. Two shapes exist:
@ -45,4 +46,4 @@ Each line is a JSON object. Two shapes exist:
4) For user toggles, send `set-heartbeats` and await response. 4) For user toggles, send `set-heartbeats` and await response.
## Backward compatibility ## Backward compatibility
- If the control port is unavailable (older gateway), the client may fall back to the legacy CLI path, but the intended path is to rely solely on this API. - If the control channel is unavailable: thats expected on modern builds. Use the Gateway WS protocol instead.

View File

@ -56,4 +56,4 @@ Notes:
## Known considerations ## Known considerations
- Heartbeats are intentionally skipped for groups to avoid noisy broadcasts. - Heartbeats are intentionally skipped for groups to avoid noisy broadcasts.
- Echo suppression uses the combined batch string; if you send identical text twice without mentions, only the first will get a response. - Echo suppression uses the combined batch string; if you send identical text twice without mentions, only the first will get a response.
- Session store entries will appear as `group:<jid>` in `sessions.json`; a missing entry just means the group hasnt triggered a run yet. - Session store entries will appear as `group:<jid>` in the session store (`~/.clawdis/sessions/sessions.json` by default); a missing entry just means the group hasnt triggered a run yet.

View File

@ -16,7 +16,7 @@ Short guide to verify the WhatsApp Web / Baileys stack without guessing.
## Deep diagnostics ## Deep diagnostics
- Creds on disk: `ls -l ~/.clawdis/credentials/creds.json` (mtime should be recent). - Creds on disk: `ls -l ~/.clawdis/credentials/creds.json` (mtime should be recent).
- Session store: `ls -l ~/.clawdis/sessions.json` (path can be overridden in config). Count and recent recipients are surfaced via `status`. - Session store: `ls -l ~/.clawdis/sessions/sessions.json` (legacy: `~/.clawdis/sessions.json`; path can be overridden in config). Count and recent recipients are surfaced via `status`.
- Relink flow: `clawdis logout && clawdis login --verbose` when status codes 409515 or `loggedOut` appear in logs. - Relink flow: `clawdis logout && clawdis login --verbose` when status codes 409515 or `loggedOut` appear in logs.
## When something fails ## When something fails

View File

@ -19,7 +19,7 @@ read_when:
<p align="center"> <p align="center">
<a href="https://github.com/steipete/clawdis">GitHub</a> · <a href="https://github.com/steipete/clawdis">GitHub</a> ·
<a href="https://www.npmjs.com/package/clawdis">npm</a> · <a href="https://github.com/steipete/clawdis/releases">Releases</a> ·
<a href="./clawd">Clawd setup</a> <a href="./clawd">Clawd setup</a>
</p> </p>
@ -29,25 +29,41 @@ Its built for [Clawd](https://clawd.me), a space lobster who needed a TARDIS.
## How it works ## How it works
``` ```
┌─────────────┐ ┌──────────┐ ┌─────────────┐ WhatsApp / Telegram
│ WhatsApp │ ───▶ │ CLAWDIS │ ───▶ │ AI Agent │
│ Telegram │ ───▶ │ 🦞⏱️💙 │ ◀─── │ (Pi) │
│ (You) │ ◀─── │ │ │ │ ┌──────────────────────────┐
└─────────────┘ └──────────┘ └─────────────┘ │ Gateway │ ws://127.0.0.1:18789 (loopback-only)
│ (single source) │ tcp://0.0.0.0:18790 (optional Bridge)
└───────────┬───────────────┘
├─ Pi agent (RPC)
├─ CLI (clawdis …)
├─ WebChat (loopback UI)
├─ macOS app (Clawdis.app)
└─ iOS node (Iris) via Bridge + pairing
``` ```
Most operations flow through the **Gateway** (`clawdis gateway`), a single long-running process that owns provider connections and the WebSocket control plane. Most operations flow through the **Gateway** (`clawdis gateway`), a single long-running process that owns provider connections and the WebSocket control plane.
## Network model
- **One Gateway per host**: it is the only process allowed to own the WhatsApp Web session.
- **Loopback-first**: Gateway WS is `ws://127.0.0.1:18789` (not exposed on the LAN).
- **Bridge for nodes**: optional LAN/tailnet-facing bridge on `tcp://0.0.0.0:18790` for paired nodes (Bonjour-discoverable).
- **Remote use**: SSH tunnel or tailnet/VPN; see `docs/remote.md` and `docs/discovery.md`.
## Features (high level) ## Features (high level)
- 📱 **WhatsApp Integration** — Uses Baileys for WhatsApp Web protocol - 📱 **WhatsApp Integration** — Uses Baileys for WhatsApp Web protocol
- ✈️ **Telegram Bot** — DMs + groups via grammY - ✈️ **Telegram Bot** — DMs + groups via grammY
- 🤖 **Agent bridge** — Pi (RPC mode) with tool streaming - 🤖 **Agent bridge** — Pi (RPC mode) with tool streaming
- 💬 **Sessions** — Per-sender (or shared `main`) conversation context - 💬 **Sessions**Direct chats collapse into shared `main` (default); groups are isolated
- 👥 **Group Chat Support** — Mention-based triggering in group chats - 👥 **Group Chat Support** — Mention-based triggering in group chats
- 📎 **Media Support** — Send and receive images, audio, documents - 📎 **Media Support** — Send and receive images, audio, documents
- 🎤 **Voice notes** — Optional transcription hook - 🎤 **Voice notes** — Optional transcription hook
- 🖥️ **WebChat + macOS app** — A local UI + menu bar companion for ops and voice wake - 🖥️ **WebChat + macOS app** — Local UI + menu bar companion for ops and voice wake
- 📱 **iOS node (Iris)** — Pairs as a node and exposes a Canvas surface
Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the only coding-agent path. Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the only coding-agent path.
@ -56,8 +72,10 @@ Note: legacy Claude/Codex/Gemini/Opencode paths have been removed; Pi is the onl
Runtime requirement: **Node ≥ 22**. Runtime requirement: **Node ≥ 22**.
```bash ```bash
# Install # From source (recommended while the npm package is still settling)
npm install -g clawdis pnpm install
pnpm build
pnpm link --global
# Pair WhatsApp Web (shows QR) # Pair WhatsApp Web (shows QR)
clawdis login clawdis login
@ -95,18 +113,23 @@ Example:
## Docs ## Docs
- [Configuration](./configuration.md) - Start here:
- [Gateway runbook](./gateway.md) - [Configuration](./configuration.md)
- [WebChat](./webchat.md) - [Clawd personal assistant setup](./clawd.md)
- [Agent integration](./agents.md) - [Gateway runbook](./gateway.md)
- [Telegram](./telegram.md) - [Discovery + transports](./discovery.md)
- [Group messages](./group-messages.md) - [Remote access](./remote.md)
- [Media: images](./images.md) - Providers and UX:
- [Media: audio](./audio.md) - [WebChat](./webchat.md)
- [Sessions](./session.md) - [Telegram](./telegram.md)
- [Cron + wakeups](./cron.md) - [Group messages](./group-messages.md)
- [Security](./security.md) - [Media: images](./images.md)
- [Troubleshooting](./troubleshooting.md) - [Media: audio](./audio.md)
- Ops and safety:
- [Sessions](./session.md)
- [Cron + wakeups](./cron.md)
- [Security](./security.md)
- [Troubleshooting](./troubleshooting.md)
## The name ## The name

View File

@ -54,13 +54,13 @@ More debugging notes: `docs/bonjour.md`.
In Iris: In Iris:
- Pick the discovered bridge (or hit refresh). - Pick the discovered bridge (or hit refresh).
- If not paired yet, Iris will initiate pairing automatically. - If not paired yet, Iris will initiate pairing automatically.
- After the first successful pairing, Iris will auto-reconnect to the **last bridge** on launch (including after reinstall), as long as the iOS Keychain entry is still present. - After the first successful pairing, Iris will auto-reconnect **strictly to the last discovered gateway** on launch (including after reinstall), as long as the iOS Keychain entry is still present.
### Connection indicator (always visible) ### Connection indicator (always visible)
The Settings tab icon shows a small status dot: The Settings tab icon shows a small status dot:
- **Green**: connected to the bridge - **Green**: connected to the bridge
- **Yellow**: connecting - **Yellow**: connecting (subtle pulse)
- **Red**: not connected / error - **Red**: not connected / error
## 4) Approve pairing (CLI) ## 4) Approve pairing (CLI)

View File

@ -10,7 +10,7 @@ Context: web chat currently lives in a WKWebView that loads the pi-web bundle. S
## Target state ## Target state
- Gateway WS adds methods: - Gateway WS adds methods:
- `chat.history { sessionKey }``{ sessionKey, messages[], thinkingLevel }` (reads the existing JSONL + sessions.json). - `chat.history { sessionKey }``{ sessionKey, messages[], thinkingLevel }` (reads the existing JSONL + session store).
- `chat.send { sessionKey, message, attachments?, thinking?, deliver?, timeoutMs<=30000, idempotencyKey }``res { runId, status:"accepted" }` or `res ok:false` on validation/timeout. - `chat.send { sessionKey, message, attachments?, thinking?, deliver?, timeoutMs<=30000, idempotencyKey }``res { runId, status:"accepted" }` or `res ok:false` on validation/timeout.
- Gateway WS emits `chat` events `{ runId, sessionKey, seq, state:"delta"|"final"|"error", message?, errorMessage?, usage?, stopReason? }`. Streaming is optional; minimum is a single `state:"final"` per send. - Gateway WS emits `chat` events `{ runId, sessionKey, seq, state:"delta"|"final"|"error", message?, errorMessage?, usage?, stopReason? }`. Streaming is optional; minimum is a single `state:"final"` per send.
- Client consumes only WS: bootstrap via `chat.history`, send via `chat.send`, live updates via `chat` events. No file watchers. - Client consumes only WS: bootstrap via `chat.history`, send via `chat.send`, live updates via `chat` events. No file watchers.

View File

@ -3,48 +3,50 @@ summary: "Remote mode topology using SSH control channels between gateway and ma
read_when: read_when:
- Running or troubleshooting remote gateway setups - Running or troubleshooting remote gateway setups
--- ---
# Remote mode with control channel # Remote access (SSH, tunnels, and tailnets)
This repo supports “remote over SSH” by keeping a single gateway (the master) running on a host (e.g., your Mac Studio) and connecting one or more macOS menu bar clients to it. The menu app no longer shells out to `pnpm clawdis …`; it talks to the gateway over a persistent control channel that is tunneled through SSH. This repo supports “remote over SSH” by keeping a single Gateway (the master) running on a host (e.g., your Mac Studio) and connecting clients to it.
Remote mode is the SSH fallback transport. As Clawdis adds a direct “bridge” transport for LAN/tailnet setups, SSH remains supported for universal reach. - For **operators (you / the macOS app)**: SSH tunneling is the universal fallback.
See `docs/discovery.md` for how clients choose between direct vs SSH. - For **nodes (Iris/iOS and future devices)**: prefer the Gateway **Bridge** when on the same LAN/tailnet (see `docs/discovery.md`).
## Topology ## The core idea
- Master: runs the gateway + control server on `127.0.0.1:18789` (in-process TCP server).
- Clients: when “Remote over SSH” is selected, the app opens one SSH tunnel:
- `ssh -N -L <localPort>:127.0.0.1:18789 <user>@<host>`
- The app then connects to `localhost:<localPort>` and keeps that socket open.
- Messages are newline-delimited JSON (documented in `docs/control-api.md`).
## Connection flow (clients) - The Gateway WebSocket binds to **loopback**: `ws://127.0.0.1:18789`.
1) Establish SSH tunnel. - For remote use, you forward that loopback port over SSH (or use a tailnet/VPN and tunnel less).
2) Open TCP socket to the local forwarded port.
3) Send `ping` to verify connectivity.
4) Issue `health`, `status`, and `last-heartbeat` requests to seed UI.
5) Listen for `event` frames (heartbeat updates, gateway status).
## Heartbeats ## SSH tunnel (CLI + tools)
- Heartbeats always run on the master gateway.
- The control server emits `event: "heartbeat"` after each heartbeat attempt and keeps the latest in memory for `last-heartbeat` requests.
- No file-based heartbeat logs/state are required when the control stream is available.
## Local mode Create a local tunnel to the remote Gateway WS:
- The menu app skips SSH and connects directly to `127.0.0.1:18789` with the same protocol.
## Failure handling ```bash
- If the tunnel drops, the client reconnects and re-issues `ping`, `health`, and `last-heartbeat` to refresh state (the mac app shows “Control channel disconnected”). ssh -N -L 18789:127.0.0.1:18789 user@host
- If the control port is unavailable (older gateway), the app can optionally fall back to the legacy CLI path, but the goal is to rely solely on the control channel. ```
## Test Remote (in the mac app) With the tunnel up:
1) SSH reachability check (`ssh -o BatchMode=yes … echo ok`). - `clawdis health` and `clawdis status --deep` now reach the remote gateway via `ws://127.0.0.1:18789`.
2) If SSH succeeds, the app opens the control tunnel and issues a `health` request; success marks the remote as ready. - `clawdis gateway {status,health,send,agent,call}` can also target the forwarded URL via `--url` when needed.
## Security ## WebChat over SSH
- Control server listens only on localhost.
- SSH tunneling reuses existing keys/agent; no additional auth is added by the control server.
## Files to keep in sync Forward both the WebChat HTTP port and the Gateway WS port:
- Protocol definition: `docs/control-api.md`.
- App connection logic: macOS `Remote over SSH` plumbing. ```bash
- Gateway control server: lives inside the Node gateway process. ssh -N \
-L 18788:127.0.0.1:18788 \
-L 18789:127.0.0.1:18789 \
user@host
```
Then open `http://127.0.0.1:18788/webchat/` locally. (Details: `docs/webchat.md`.)
## macOS app “Remote over SSH”
The macOS menu bar app can drive the same setup end-to-end (remote status checks, WebChat, and Voice Wake forwarding).
Runbook: `docs/mac/remote.md`.
## Legacy control channel
Older builds experimented with a newline-delimited TCP control channel on the same port.
That API is deprecated and should not be relied on. (Historical reference: `docs/control-api.md`.)

View File

@ -7,7 +7,7 @@ read_when:
Updated: 2025-12-07 Updated: 2025-12-07
Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media send, proxy, and webhook helpers all ship in-tree. Status: ready for bot-mode use with grammY (long-polling by default; webhook supported when configured). Text + media send, mention-gated group replies, and optional proxy support are implemented.
## Goals ## Goals
- Let you talk to Clawdis via a Telegram bot in DMs and groups. - Let you talk to Clawdis via a Telegram bot in DMs and groups.
@ -17,7 +17,11 @@ Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media s
## How it will work (Bot API) ## How it will work (Bot API)
1) Create a bot with @BotFather and grab the token. 1) Create a bot with @BotFather and grab the token.
2) Configure Clawdis with `TELEGRAM_BOT_TOKEN` (or `telegram.botToken` in `~/.clawdis/clawdis.json`). 2) Configure Clawdis with `TELEGRAM_BOT_TOKEN` (or `telegram.botToken` in `~/.clawdis/clawdis.json`).
3) Run the gateway; it auto-starts Telegram when the bot token is set. To force Telegram-only: `clawdis gateway --provider telegram`. Webhook mode: `clawdis gateway --provider telegram --webhook --port 8787 --webhook-secret <secret>` (optionally `--webhook-url` when the public URL differs). 3) Run the gateway; it auto-starts Telegram when the bot token is set.
- **Long-polling** is the default.
- **Webhook mode** is enabled by setting `telegram.webhookUrl` (optionally `telegram.webhookSecret` / `telegram.webhookPath`).
- The webhook listener currently binds to `0.0.0.0:8787` and serves `POST /telegram-webhook` by default.
- If you need a different public port/host, set `telegram.webhookUrl` to the externally reachable URL and use a reverse proxy to forward to `:8787`.
4) Direct chats: user sends the first message; all subsequent turns land in the shared `main` session (default, no extra config). 4) Direct chats: user sends the first message; all subsequent turns land in the shared `main` session (default, no extra config).
5) Groups: add the bot, disable privacy mode (or make it admin) so it can read messages; group threads stay on `group:<chatId>` and require mention/command to trigger replies. 5) Groups: add the bot, disable privacy mode (or make it admin) so it can read messages; group threads stay on `group:<chatId>` and require mention/command to trigger replies.
6) Optional allowlist: reuse `inbound.allowFrom` for direct chats by chat id (`123456789` or `telegram:123456789`). 6) Optional allowlist: reuse `inbound.allowFrom` for direct chats by chat id (`123456789` or `telegram:123456789`).
@ -32,7 +36,7 @@ Status: ready for bot-mode use with grammY (long-poll + webhook). Text + media s
- Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits. - Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits.
- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default. - Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default.
- Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort. - Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort.
- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl` supported. - Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl`, `telegram.webhookPath` supported.
Example config: Example config:
```json5 ```json5
@ -44,6 +48,7 @@ Example config:
mediaMaxMb: 5, mediaMaxMb: 5,
proxy: "socks5://localhost:9050", proxy: "socks5://localhost:9050",
webhookSecret: "mysecret", webhookSecret: "mysecret",
webhookPath: "/telegram-webhook",
webhookUrl: "https://yourdomain.com/telegram-webhook" webhookUrl: "https://yourdomain.com/telegram-webhook"
} }
} }
@ -62,6 +67,6 @@ Example config:
- ⏳ Add more grammY coverage (webhook payloads, media edge cases) - ⏳ Add more grammY coverage (webhook payloads, media edge cases)
## Safety & ops ## Safety & ops
- Treat the bot token as a secret (equivalent to account control); store under `~/.clawdis/credentials/` with 0600 perms. - Treat the bot token as a secret (equivalent to account control); prefer `TELEGRAM_BOT_TOKEN` or a locked-down config file (`chmod 600 ~/.clawdis/clawdis.json`).
- Respect Telegram rate limits (429s); we'll add throttling in the provider to stay below flood thresholds. - Respect Telegram rate limits (429s); grammY throttling is enabled by default.
- Use a test bot for development to avoid hitting production chats. - Use a test bot for development to avoid hitting production chats.

View File

@ -98,6 +98,8 @@ cat > "$APP_ROOT/Contents/Info.plist" <<PLIST
<string>Clawdis needs notification permission to show alerts for agent actions.</string> <string>Clawdis needs notification permission to show alerts for agent actions.</string>
<key>NSScreenCaptureDescription</key> <key>NSScreenCaptureDescription</key>
<string>Clawdis captures the screen when the agent needs screenshots for context.</string> <string>Clawdis captures the screen when the agent needs screenshots for context.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested by the agent.</string>
<key>NSMicrophoneUsageDescription</key> <key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string> <string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string>
<key>NSSpeechRecognitionUsageDescription</key> <key>NSSpeechRecognitionUsageDescription</key>

View File

@ -0,0 +1,64 @@
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
import { describe, expect, it } from "vitest";
import {
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
// Unit tests for the pure camera payload/temp-path helpers.
describe("nodes camera helpers", () => {
  it("parses camera.snap payload", () => {
    const parsed = parseCameraSnapPayload({
      format: "jpg",
      base64: "aGk=",
      width: 10,
      height: 20,
    });
    expect(parsed).toEqual({ format: "jpg", base64: "aGk=", width: 10, height: 20 });
  });

  it("rejects invalid camera.snap payload", () => {
    // Missing base64/width/height must be rejected with the payload error.
    expect(() => parseCameraSnapPayload({ format: "jpg" })).toThrow(
      /invalid camera\.snap payload/i,
    );
  });

  it("parses camera.clip payload", () => {
    const parsed = parseCameraClipPayload({
      format: "mp4",
      base64: "AAEC",
      durationMs: 1234,
      hasAudio: true,
    });
    expect(parsed).toEqual({
      format: "mp4",
      base64: "AAEC",
      durationMs: 1234,
      hasAudio: true,
    });
  });

  it("builds stable temp paths when id provided", () => {
    // With an explicit id + tmpDir the path is fully deterministic.
    const snapPath = cameraTempPath({
      kind: "snap",
      facing: "front",
      ext: "jpg",
      tmpDir: "/tmp",
      id: "id1",
    });
    expect(snapPath).toBe(path.join("/tmp", "clawdis-camera-snap-front-id1.jpg"));
  });

  it("writes base64 to file", async () => {
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-test-"));
    const outFile = path.join(tmpDir, "x.bin");
    // "aGk=" decodes to the ASCII bytes "hi".
    await writeBase64ToFile(outFile, "aGk=");
    await expect(fs.readFile(outFile, "utf8")).resolves.toBe("hi");
    await fs.rm(tmpDir, { recursive: true, force: true });
  });
});

92
src/cli/nodes-camera.ts Normal file
View File

@ -0,0 +1,92 @@
import { randomUUID } from "node:crypto";
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
/** Which device camera a node should use. */
export type CameraFacing = "front" | "back";

/** Payload a node returns for a `camera.snap` invoke. */
export type CameraSnapPayload = {
  // Image container format as reported by the node (e.g. "jpg").
  format: string;
  // Raw image bytes, base64-encoded.
  base64: string;
  // Pixel dimensions of the captured image.
  width: number;
  height: number;
};

/** Payload a node returns for a `camera.clip` invoke. */
export type CameraClipPayload = {
  // Video container format as reported by the node (e.g. "mp4").
  format: string;
  // Raw video bytes, base64-encoded.
  base64: string;
  // Clip length in milliseconds.
  durationMs: number;
  // Whether an audio track was captured alongside the video.
  hasAudio: boolean;
};
/** Narrow `value` to a string-keyed record; null and non-objects become `{}`. */
function asRecord(value: unknown): Record<string, unknown> {
  if (value !== null && typeof value === "object") {
    return value as Record<string, unknown>;
  }
  return {};
}
/** Return `value` when it is a string, otherwise `undefined`. */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") return value;
  return undefined;
}
/** Return `value` when it is a finite number; NaN/Infinity/non-numbers map to `undefined`. */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") return undefined;
  return Number.isFinite(value) ? value : undefined;
}
/** Return `value` when it is a boolean, otherwise `undefined`. */
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value === "boolean") return value;
  return undefined;
}
export function parseCameraSnapPayload(value: unknown): CameraSnapPayload {
const obj = asRecord(value);
const format = asString(obj.format);
const base64 = asString(obj.base64);
const width = asNumber(obj.width);
const height = asNumber(obj.height);
if (!format || !base64 || width === undefined || height === undefined) {
throw new Error("invalid camera.snap payload");
}
return { format, base64, width, height };
}
export function parseCameraClipPayload(value: unknown): CameraClipPayload {
const obj = asRecord(value);
const format = asString(obj.format);
const base64 = asString(obj.base64);
const durationMs = asNumber(obj.durationMs);
const hasAudio = asBoolean(obj.hasAudio);
if (
!format ||
!base64 ||
durationMs === undefined ||
hasAudio === undefined
) {
throw new Error("invalid camera.clip payload");
}
return { format, base64, durationMs, hasAudio };
}
/**
 * Build a temp-file path for captured camera media, e.g.
 * `<tmp>/clawdis-camera-snap-front-<id>.jpg`.
 *
 * Defaults: `tmpDir` falls back to the OS temp dir, `id` to a random UUID.
 * A leading dot on `ext` is optional.
 */
export function cameraTempPath(opts: {
  kind: "snap" | "clip";
  facing?: CameraFacing;
  ext: string;
  tmpDir?: string;
  id?: string;
}) {
  const { kind, facing, ext, tmpDir, id } = opts;
  const directory = tmpDir ?? os.tmpdir();
  const uniqueId = id ?? randomUUID();
  // Assemble "clawdis-camera-<kind>[-<facing>]-<id>" as dash-joined parts.
  const parts = ["clawdis-camera", kind];
  if (facing) parts.push(facing);
  parts.push(uniqueId);
  const suffix = ext.startsWith(".") ? ext : `.${ext}`;
  return path.join(directory, `${parts.join("-")}${suffix}`);
}
/**
 * Decode `base64` and persist the bytes at `filePath` (overwriting any
 * existing file).
 *
 * @returns The destination path and the number of decoded bytes written.
 */
export async function writeBase64ToFile(filePath: string, base64: string) {
  const decoded = Buffer.from(base64, "base64");
  await fs.writeFile(filePath, decoded);
  return { path: filePath, bytes: decoded.length };
}

View File

@ -1,6 +1,13 @@
import type { Command } from "commander"; import type { Command } from "commander";
import { callGateway, randomIdempotencyKey } from "../gateway/call.js"; import { callGateway, randomIdempotencyKey } from "../gateway/call.js";
import { defaultRuntime } from "../runtime.js"; import { defaultRuntime } from "../runtime.js";
import {
type CameraFacing,
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
type NodesRpcOpts = { type NodesRpcOpts = {
url?: string; url?: string;
@ -12,6 +19,11 @@ type NodesRpcOpts = {
params?: string; params?: string;
invokeTimeout?: string; invokeTimeout?: string;
idempotencyKey?: string; idempotencyKey?: string;
facing?: string;
maxWidth?: string;
quality?: string;
duration?: string;
audio?: boolean;
}; };
type NodeListNode = { type NodeListNode = {
@ -340,4 +352,203 @@ export function registerNodesCli(program: Command) {
}), }),
{ timeoutMs: 30_000 }, { timeoutMs: 30_000 },
); );
const parseFacing = (value: string): CameraFacing => {
const v = String(value ?? "")
.trim()
.toLowerCase();
if (v === "front" || v === "back") return v;
throw new Error(`invalid facing: ${value} (expected front|back)`);
};
  // Parent command grouping node camera capture subcommands (snap + clip).
  const camera = nodes
    .command("camera")
    .description("Capture camera media from a paired node");

  // `clawdis nodes camera snap`: capture one photo per requested facing on a
  // paired node and write each image to a local temp file.
  nodesCallOpts(
    camera
      .command("snap")
      .description("Capture a photo from a node camera (prints MEDIA:<path>)")
      .requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
      .option("--facing <front|back|both>", "Camera facing", "both")
      .option("--max-width <px>", "Max width in px (optional)")
      .option("--quality <0-1>", "JPEG quality (default 0.9)")
      .option(
        "--invoke-timeout <ms>",
        "Node invoke timeout in ms (default 20000)",
        "20000",
      )
      .action(async (opts: NodesRpcOpts) => {
        try {
          // Resolve the node reference (id, display name, or IP) to a node id.
          const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
          const facingOpt = String(opts.facing ?? "both")
            .trim()
            .toLowerCase();
          // "both" expands to one capture per facing; anything else must be
          // exactly "front" or "back" (the IIFE throws for bad values).
          const facings: CameraFacing[] =
            facingOpt === "both"
              ? ["front", "back"]
              : facingOpt === "front" || facingOpt === "back"
                ? [facingOpt]
                : (() => {
                    throw new Error(
                      `invalid facing: ${String(opts.facing)} (expected front|back|both)`,
                    );
                  })();
          // Optional numeric flags; NaN results are filtered out below via
          // Number.isFinite before being sent to the node.
          const maxWidth = opts.maxWidth
            ? Number.parseInt(String(opts.maxWidth), 10)
            : undefined;
          const quality = opts.quality
            ? Number.parseFloat(String(opts.quality))
            : undefined;
          const timeoutMs = opts.invokeTimeout
            ? Number.parseInt(String(opts.invokeTimeout), 10)
            : undefined;
          const results: Array<{
            facing: CameraFacing;
            path: string;
            width: number;
            height: number;
          }> = [];
          // Capture sequentially so each facing gets the camera to itself.
          for (const facing of facings) {
            const invokeParams: Record<string, unknown> = {
              nodeId,
              command: "camera.snap",
              params: {
                facing,
                maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined,
                quality: Number.isFinite(quality) ? quality : undefined,
                format: "jpg",
              },
              idempotencyKey: randomIdempotencyKey(),
            };
            if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
              invokeParams.timeoutMs = timeoutMs;
            }
            const raw = (await callGatewayCli(
              "node.invoke",
              opts,
              invokeParams,
            )) as unknown;
            // Defensive narrowing: the gateway response may not be an object.
            const res =
              typeof raw === "object" && raw !== null
                ? (raw as { payload?: unknown })
                : {};
            // Validate the node payload, then persist the image to a temp file
            // (normalizing "jpeg" to the "jpg" extension).
            const payload = parseCameraSnapPayload(res.payload);
            const filePath = cameraTempPath({
              kind: "snap",
              facing,
              ext: payload.format === "jpeg" ? "jpg" : payload.format,
            });
            await writeBase64ToFile(filePath, payload.base64);
            results.push({
              facing,
              path: filePath,
              width: payload.width,
              height: payload.height,
            });
          }
          if (opts.json) {
            defaultRuntime.log(JSON.stringify({ files: results }, null, 2));
            return;
          }
          // Default output: one MEDIA:<path> line per captured facing.
          defaultRuntime.log(results.map((r) => `MEDIA:${r.path}`).join("\n"));
        } catch (err) {
          defaultRuntime.error(`nodes camera snap failed: ${String(err)}`);
          defaultRuntime.exit(1);
        }
      }),
    { timeoutMs: 60_000 },
  );
  // `clawdis nodes camera clip`: record a short video on a paired node and
  // write the clip to a local temp file.
  nodesCallOpts(
    camera
      .command("clip")
      .description(
        "Capture a short video clip from a node camera (prints MEDIA:<path>)",
      )
      .requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
      .option("--facing <front|back>", "Camera facing", "front")
      .option("--duration <ms>", "Duration in ms (default 3000)", "3000")
      .option("--no-audio", "Disable audio capture")
      .option(
        "--invoke-timeout <ms>",
        "Node invoke timeout in ms (default 45000)",
        "45000",
      )
      .action(async (opts: NodesRpcOpts & { audio?: boolean }) => {
        try {
          const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
          // Clip capture supports a single facing only (no "both").
          const facing = parseFacing(String(opts.facing ?? "front"));
          const durationMs = Number.parseInt(
            String(opts.duration ?? "3000"),
            10,
          );
          // Commander maps --no-audio to opts.audio === false.
          const includeAudio = opts.audio !== false;
          const timeoutMs = opts.invokeTimeout
            ? Number.parseInt(String(opts.invokeTimeout), 10)
            : undefined;
          const invokeParams: Record<string, unknown> = {
            nodeId,
            command: "camera.clip",
            params: {
              facing,
              durationMs: Number.isFinite(durationMs) ? durationMs : undefined,
              includeAudio,
              format: "mp4",
            },
            idempotencyKey: randomIdempotencyKey(),
          };
          if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
            invokeParams.timeoutMs = timeoutMs;
          }
          const raw = (await callGatewayCli(
            "node.invoke",
            opts,
            invokeParams,
          )) as unknown;
          // Defensive narrowing: the gateway response may not be an object.
          const res =
            typeof raw === "object" && raw !== null
              ? (raw as { payload?: unknown })
              : {};
          // Validate the node payload and write the clip to a temp file.
          const payload = parseCameraClipPayload(res.payload);
          const filePath = cameraTempPath({
            kind: "clip",
            facing,
            ext: payload.format,
          });
          await writeBase64ToFile(filePath, payload.base64);
          if (opts.json) {
            defaultRuntime.log(
              JSON.stringify(
                {
                  file: {
                    facing,
                    path: filePath,
                    durationMs: payload.durationMs,
                    hasAudio: payload.hasAudio,
                  },
                },
                null,
                2,
              ),
            );
            return;
          }
          // Default output: a single MEDIA:<path> line.
          defaultRuntime.log(`MEDIA:${filePath}`);
        } catch (err) {
          defaultRuntime.error(`nodes camera clip failed: ${String(err)}`);
          defaultRuntime.exit(1);
        }
      }),
    { timeoutMs: 90_000 },
  );
} }

View File

@ -1,3 +1,4 @@
import * as fs from "node:fs/promises";
import { beforeEach, describe, expect, it, vi } from "vitest"; import { beforeEach, describe, expect, it, vi } from "vitest";
const sendCommand = vi.fn(); const sendCommand = vi.fn();
@ -148,4 +149,145 @@ describe("cli program", () => {
); );
expect(runtime.log).toHaveBeenCalled(); expect(runtime.log).toHaveBeenCalled();
}); });
  it("runs nodes camera snap and prints two MEDIA paths", async () => {
    // Mock order matters: the first gateway call resolves the node list; the
    // next two answer one camera.snap invoke per facing (front, then back).
    callGateway
      .mockResolvedValueOnce({
        ts: Date.now(),
        nodes: [
          {
            nodeId: "ios-node",
            displayName: "iOS Node",
            remoteIp: "192.168.0.88",
            connected: true,
          },
        ],
      })
      .mockResolvedValueOnce({
        ok: true,
        nodeId: "ios-node",
        command: "camera.snap",
        payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
      })
      .mockResolvedValueOnce({
        ok: true,
        nodeId: "ios-node",
        command: "camera.snap",
        payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
      });
    const program = buildProgram();
    runtime.log.mockClear();
    await program.parseAsync(
      ["nodes", "camera", "snap", "--node", "ios-node"],
      {
        from: "user",
      },
    );
    // Second gateway call: front-facing snap with the default 20s timeout.
    expect(callGateway).toHaveBeenNthCalledWith(
      2,
      expect.objectContaining({
        method: "node.invoke",
        params: expect.objectContaining({
          nodeId: "ios-node",
          command: "camera.snap",
          timeoutMs: 20000,
          idempotencyKey: "idem-test",
          params: expect.objectContaining({ facing: "front", format: "jpg" }),
        }),
      }),
    );
    // Third gateway call: back-facing snap (default --facing is "both").
    expect(callGateway).toHaveBeenNthCalledWith(
      3,
      expect.objectContaining({
        method: "node.invoke",
        params: expect.objectContaining({
          nodeId: "ios-node",
          command: "camera.snap",
          timeoutMs: 20000,
          idempotencyKey: "idem-test",
          params: expect.objectContaining({ facing: "back", format: "jpg" }),
        }),
      }),
    );
    // The CLI prints one MEDIA:<path> line per facing; extract both paths.
    const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
    const mediaPaths = out
      .split("\n")
      .filter((l) => l.startsWith("MEDIA:"))
      .map((l) => l.replace(/^MEDIA:/, ""))
      .filter(Boolean);
    expect(mediaPaths).toHaveLength(2);
    try {
      // Each temp file holds the decoded base64 payload ("aGk=" -> "hi").
      for (const p of mediaPaths) {
        await expect(fs.readFile(p, "utf8")).resolves.toBe("hi");
      }
    } finally {
      // Always clean up the temp files, even if an assertion failed.
      await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {})));
    }
  });
  it("runs nodes camera clip and prints one MEDIA path", async () => {
    // Mock order: first the node-list resolution, then a single camera.clip
    // invoke response carrying a tiny base64 "video".
    callGateway
      .mockResolvedValueOnce({
        ts: Date.now(),
        nodes: [
          {
            nodeId: "ios-node",
            displayName: "iOS Node",
            remoteIp: "192.168.0.88",
            connected: true,
          },
        ],
      })
      .mockResolvedValueOnce({
        ok: true,
        nodeId: "ios-node",
        command: "camera.clip",
        payload: {
          format: "mp4",
          base64: "aGk=",
          durationMs: 3000,
          hasAudio: true,
        },
      });
    const program = buildProgram();
    runtime.log.mockClear();
    await program.parseAsync(
      ["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"],
      { from: "user" },
    );
    // The invoke should carry the default facing/audio and the mp4 format,
    // plus the default 45s timeout.
    expect(callGateway).toHaveBeenNthCalledWith(
      2,
      expect.objectContaining({
        method: "node.invoke",
        params: expect.objectContaining({
          nodeId: "ios-node",
          command: "camera.clip",
          timeoutMs: 45000,
          idempotencyKey: "idem-test",
          params: expect.objectContaining({
            facing: "front",
            durationMs: 3000,
            includeAudio: true,
            format: "mp4",
          }),
        }),
      }),
    );
    // A single MEDIA:<path> line pointing at the temp clip file.
    const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
    const mediaPath = out.replace(/^MEDIA:/, "").trim();
    expect(mediaPath).toMatch(/clawdis-camera-clip-front-.*\.mp4$/);
    try {
      // The decoded clip bytes ("hi") should be on disk at the printed path.
      await expect(fs.readFile(mediaPath, "utf8")).resolves.toBe("hi");
    } finally {
      // Always clean up the temp file.
      await fs.unlink(mediaPath).catch(() => {});
    }
  });
}); });