VoiceWake: capture utterance and add prefix
parent
6415ae79be
commit
a6e0ec38e7
|
|
@ -178,15 +178,24 @@ final class AppState: ObservableObject {
|
||||||
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
|
Task { await VoiceWakeRuntime.shared.refresh(state: self) }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Raises the "big ears" state to signal that voice input was detected.
///
/// Any previously scheduled auto-reset is cancelled first, so the most recent call decides how
/// long the ears stay up.
///
/// - Parameter ttl: Seconds after which the boost is cleared automatically. Pass `nil` to keep
///   the ears raised until `stopVoiceEars()` is called.
func triggerVoiceEars(ttl: TimeInterval? = 5) {
    self.earBoostTask?.cancel()
    self.earBoostActive = true

    guard let ttl else { return }

    // Negative values are clamped to "reset right away". A duration too large to express in
    // nanoseconds (including +infinity) would trap in the UInt64 conversion, so it is treated
    // as "no automatic reset" instead.
    guard let nanoseconds = UInt64(exactly: (max(0, ttl) * 1_000_000_000).rounded(.towardZero)) else { return }

    self.earBoostTask = Task { [weak self] in
        do {
            try await Task.sleep(nanoseconds: nanoseconds)
        } catch {
            // The sleep throws when this task is cancelled by a newer trigger or by
            // stopVoiceEars(). That caller owns the flag now, so leave it untouched.
            return
        }
        await MainActor.run { [weak self] in
            // Re-check in case the cancellation landed between the sleep and this hop.
            guard !Task.isCancelled else { return }
            self?.earBoostActive = false
        }
    }
}
|
||||||
|
|
||||||
|
/// Lowers the ears immediately and discards any pending auto-reset timer.
func stopVoiceEars() {
    let pendingReset = self.earBoostTask
    self.earBoostTask = nil
    pendingReset?.cancel()
    self.earBoostActive = false
}
|
||||||
|
|
||||||
func setVoiceWakeEnabled(_ enabled: Bool) async {
|
func setVoiceWakeEnabled(_ enabled: Bool) async {
|
||||||
guard voiceWakeSupported else {
|
guard voiceWakeSupported else {
|
||||||
self.swabbleEnabled = false
|
self.swabbleEnabled = false
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,19 @@ enum VoiceWakeForwarder {
|
||||||
private static let cliSearchCandidates = ["clawdis-mac"] + cliHelperSearchPaths.map { "\($0)/clawdis-mac" }
|
private static let cliSearchCandidates = ["clawdis-mac"] + cliHelperSearchPaths.map { "\($0)/clawdis-mac" }
|
||||||
private static let cliCache = CLICache()
|
private static let cliCache = CLICache()
|
||||||
|
|
||||||
|
/// Prepends the voice-recognition hint to `transcript` so the receiving model knows the text
/// was dictated and may contain transcription errors.
///
/// - Parameters:
///   - transcript: The recognized utterance; appended unchanged after a blank line.
///   - machineName: Optional override for the machine label. When `nil`, empty, or
///     whitespace-only, the label falls back to `Host.current().localizedName`, then
///     `ProcessInfo.processInfo.hostName`, and finally the literal "this Mac".
/// - Returns: The hint line, a blank line, and the transcript.
static func prefixedTranscript(_ transcript: String, machineName: String? = nil) -> String {
    // Trims a candidate label and rejects it when nothing printable remains, so every source
    // (override and system-provided alike) is held to the same rule.
    func usableName(_ candidate: String?) -> String? {
        guard let trimmed = candidate?.trimmingCharacters(in: .whitespacesAndNewlines),
              !trimmed.isEmpty
        else { return nil }
        return trimmed
    }

    // `??` evaluates its right-hand side lazily, so the system lookups only run when needed.
    let machine = usableName(machineName)
        ?? usableName(Host.current().localizedName)
        ?? usableName(ProcessInfo.processInfo.hostName)
        ?? "this Mac"

    return "User talked via voice recognition on \(machine) - repeat prompt first + remember some words might be incorrectly transcribed.\n\n\(transcript)"
}
|
||||||
|
|
||||||
static func clearCliCache() {
|
static func clearCliCache() {
|
||||||
self.cliCache.set(nil)
|
self.cliCache.set(nil)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,9 +14,18 @@ actor VoiceWakeRuntime {
|
||||||
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
||||||
private var recognitionTask: SFSpeechRecognitionTask?
|
private var recognitionTask: SFSpeechRecognitionTask?
|
||||||
private var lastHeard: Date?
|
private var lastHeard: Date?
|
||||||
|
private var captureStartedAt: Date?
|
||||||
|
private var captureTask: Task<Void, Never>?
|
||||||
|
private var capturedTranscript: String = ""
|
||||||
|
private var isCapturing: Bool = false
|
||||||
private var cooldownUntil: Date?
|
private var cooldownUntil: Date?
|
||||||
private var currentConfig: RuntimeConfig?
|
private var currentConfig: RuntimeConfig?
|
||||||
|
|
||||||
|
// Tunables
|
||||||
|
private let silenceWindow: TimeInterval = 1.0
|
||||||
|
private let captureHardStop: TimeInterval = 8.0
|
||||||
|
private let debounceAfterSend: TimeInterval = 0.35
|
||||||
|
|
||||||
struct RuntimeConfig: Equatable {
|
struct RuntimeConfig: Equatable {
|
||||||
let triggers: [String]
|
let triggers: [String]
|
||||||
let micID: String?
|
let micID: String?
|
||||||
|
|
@ -95,6 +104,11 @@ actor VoiceWakeRuntime {
|
||||||
}
|
}
|
||||||
|
|
||||||
private func stop() {
|
private func stop() {
|
||||||
|
self.captureTask?.cancel()
|
||||||
|
self.captureTask = nil
|
||||||
|
self.isCapturing = false
|
||||||
|
self.capturedTranscript = ""
|
||||||
|
self.captureStartedAt = nil
|
||||||
self.recognitionTask?.cancel()
|
self.recognitionTask?.cancel()
|
||||||
self.recognitionTask = nil
|
self.recognitionTask = nil
|
||||||
self.recognitionRequest?.endAudio()
|
self.recognitionRequest?.endAudio()
|
||||||
|
|
@ -120,21 +134,22 @@ actor VoiceWakeRuntime {
|
||||||
}
|
}
|
||||||
|
|
||||||
guard let transcript else { return }
|
guard let transcript else { return }
|
||||||
if !transcript.isEmpty { self.lastHeard = Date() }
|
|
||||||
|
let now = Date()
|
||||||
|
if !transcript.isEmpty {
|
||||||
|
self.lastHeard = now
|
||||||
|
if self.isCapturing {
|
||||||
|
self.capturedTranscript = transcript
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.isCapturing { return }
|
||||||
|
|
||||||
if Self.matches(text: transcript, triggers: config.triggers) {
|
if Self.matches(text: transcript, triggers: config.triggers) {
|
||||||
let now = Date()
|
|
||||||
if let cooldown = cooldownUntil, now < cooldown {
|
if let cooldown = cooldownUntil, now < cooldown {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
self.cooldownUntil = now.addingTimeInterval(2.5)
|
await self.beginCapture(transcript: transcript, config: config)
|
||||||
await MainActor.run { AppStateStore.shared.triggerVoiceEars() }
|
|
||||||
let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
|
|
||||||
if forwardConfig.enabled {
|
|
||||||
Task.detached {
|
|
||||||
await VoiceWakeForwarder.forward(transcript: transcript, config: forwardConfig)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -149,6 +164,77 @@ actor VoiceWakeRuntime {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Switches the runtime into capture mode after a trigger match.
///
/// Seeds the capture buffer with the transcript that contained the trigger, clears any pending
/// cooldown, raises the ears without a timeout, and starts the task that watches for the end
/// of the utterance.
///
/// - Parameters:
///   - transcript: The transcript that matched a trigger; becomes the initial buffer content.
///   - config: Runtime configuration handed on to the capture monitor.
private func beginCapture(transcript: String, config: RuntimeConfig) async {
    // All capture state is written before the first suspension point below.
    self.captureStartedAt = Date()
    self.capturedTranscript = transcript
    self.cooldownUntil = nil
    self.isCapturing = true

    await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }

    // Replace any earlier monitor with a fresh one for this capture.
    self.captureTask?.cancel()
    let monitor = Task { [weak self] in
        guard let self else { return }
        await self.monitorCapture(config: config)
    }
    self.captureTask = monitor
}
|
||||||
|
|
||||||
|
/// Polls the capture state every 200ms and finalizes the utterance once nothing has been heard
/// for `silenceWindow`, or once `captureHardStop` has elapsed since the capture began.
///
/// The loop ends without finalizing when the capture is torn down elsewhere (`isCapturing`
/// becomes false) or when the surrounding task is cancelled. Honoring cancellation keeps a
/// superseded monitor from applying its stale deadline to a later capture.
///
/// - Parameter config: Runtime configuration passed through to `finalizeCapture(config:)`.
private func monitorCapture(config: RuntimeConfig) async {
    let startedAt = self.captureStartedAt ?? Date()
    let deadline = startedAt.addingTimeInterval(self.captureHardStop)

    while self.isCapturing, !Task.isCancelled {
        let now = Date()

        var shouldFinalize = now >= deadline
        if !shouldFinalize, let last = self.lastHeard {
            shouldFinalize = now.timeIntervalSince(last) >= self.silenceWindow
        }

        if shouldFinalize {
            await self.finalizeCapture(config: config)
            return
        }

        do {
            try await Task.sleep(nanoseconds: 200_000_000)
        } catch {
            // The sleep throws on cancellation; whoever cancelled this task owns the capture
            // state now, so stop polling instead of spinning on.
            return
        }
    }
}
|
||||||
|
|
||||||
|
/// Ends the active capture: lowers the ears, forwards the trimmed utterance when it is not
/// empty, arms the post-send debounce, and restarts the recognizer.
///
/// Returns immediately when no capture is in progress.
///
/// - Parameter config: Runtime configuration passed through to `send(transcript:config:)`.
private func finalizeCapture(config: RuntimeConfig) async {
    guard self.isCapturing else { return }

    // Leave capture mode and drop the monitor before the first suspension point.
    self.isCapturing = false
    self.captureTask?.cancel()
    self.captureTask = nil

    // Snapshot the utterance, then clear the per-capture state.
    let utterance = self.capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    self.lastHeard = nil
    self.captureStartedAt = nil
    self.capturedTranscript = ""

    await MainActor.run { AppStateStore.shared.stopVoiceEars() }

    if utterance.isEmpty == false {
        await self.send(transcript: utterance, config: config)
    }

    self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)

    // Restart the recognizer so we listen for the next trigger with a clean buffer.
    // The config is read before stop() runs and reused for the restart.
    let restartConfig = self.currentConfig
    self.stop()
    guard let restartConfig else { return }
    await self.start(with: restartConfig)
}
|
||||||
|
|
||||||
|
/// Forwards a captured utterance, prefixed with the voice-recognition hint, when forwarding is
/// enabled in the app state.
///
/// - Parameters:
///   - transcript: The trimmed utterance to forward.
///   - config: Runtime configuration; not read by this method, which takes its forwarding
///     settings from `AppStateStore.shared.voiceWakeForwardConfig`.
private func send(transcript: String, config: RuntimeConfig) async {
    let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }

    if forwardConfig.enabled {
        let payload = VoiceWakeForwarder.prefixedTranscript(transcript)

        // Fire and forget: the forward runs in a detached task and is not awaited here.
        Task.detached {
            await VoiceWakeForwarder.forward(transcript: payload, config: forwardConfig)
        }
    }
}
|
||||||
|
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
static func _testMatches(text: String, triggers: [String]) -> Bool {
|
static func _testMatches(text: String, triggers: [String]) -> Bool {
|
||||||
self.matches(text: text, triggers: triggers)
|
self.matches(text: text, triggers: triggers)
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ final class VoiceWakeTester {
|
||||||
private var holdingAfterDetect = false
|
private var holdingAfterDetect = false
|
||||||
private var detectedText: String?
|
private var detectedText: String?
|
||||||
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake")
|
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake")
|
||||||
|
private let silenceWindow: TimeInterval = 1.0
|
||||||
|
|
||||||
init(locale: Locale = .current) {
|
init(locale: Locale = .current) {
|
||||||
self.recognizer = SFSpeechRecognizer(locale: locale)
|
self.recognizer = SFSpeechRecognizer(locale: locale)
|
||||||
|
|
@ -132,10 +133,11 @@ final class VoiceWakeTester {
|
||||||
self.holdingAfterDetect = true
|
self.holdingAfterDetect = true
|
||||||
self.detectedText = text
|
self.detectedText = text
|
||||||
self.logger.info("voice wake detected; forwarding (len=\(text.count))")
|
self.logger.info("voice wake detected; forwarding (len=\(text.count))")
|
||||||
await MainActor.run { AppStateStore.shared.triggerVoiceEars() }
|
await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
|
||||||
let config = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
|
let config = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
|
||||||
Task.detached {
|
Task.detached {
|
||||||
await VoiceWakeForwarder.forward(transcript: text, config: config)
|
let payload = VoiceWakeForwarder.prefixedTranscript(text)
|
||||||
|
await VoiceWakeForwarder.forward(transcript: payload, config: config)
|
||||||
}
|
}
|
||||||
Task { @MainActor in onUpdate(.detected(text)) }
|
Task { @MainActor in onUpdate(.detected(text)) }
|
||||||
self.holdUntilSilence(onUpdate: onUpdate)
|
self.holdUntilSilence(onUpdate: onUpdate)
|
||||||
|
|
@ -162,8 +164,7 @@ final class VoiceWakeTester {
|
||||||
Task { [weak self] in
|
Task { [weak self] in
|
||||||
guard let self else { return }
|
guard let self else { return }
|
||||||
let detectedAt = Date()
|
let detectedAt = Date()
|
||||||
let hardStop = detectedAt.addingTimeInterval(3) // cap overall listen after trigger
|
let hardStop = detectedAt.addingTimeInterval(6) // cap overall listen after trigger
|
||||||
let silenceWindow: TimeInterval = 0.8
|
|
||||||
|
|
||||||
while !self.isStopping {
|
while !self.isStopping {
|
||||||
let now = Date()
|
let now = Date()
|
||||||
|
|
@ -175,6 +176,7 @@ final class VoiceWakeTester {
|
||||||
}
|
}
|
||||||
if !self.isStopping {
|
if !self.isStopping {
|
||||||
self.stop()
|
self.stop()
|
||||||
|
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
|
||||||
if let detectedText {
|
if let detectedText {
|
||||||
self.logger.info("voice wake hold finished; len=\(detectedText.count)")
|
self.logger.info("voice wake hold finished; len=\(detectedText.count)")
|
||||||
Task { @MainActor in onUpdate(.detected(detectedText)) }
|
Task { @MainActor in onUpdate(.detected(detectedText)) }
|
||||||
|
|
|
||||||
|
|
@ -73,4 +73,13 @@ import Testing
|
||||||
let escapedQuote = VoiceWakeForwarder.shellEscape(textWithQuote)
|
let escapedQuote = VoiceWakeForwarder.shellEscape(textWithQuote)
|
||||||
#expect(escapedQuote == "'Debug test works (and a funny pun)'\\'''")
|
#expect(escapedQuote == "'Debug test works (and a funny pun)'\\'''")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Pins the complete forwarding payload for an explicit machine name, so any change to the
/// hint wording, the separator, or the transcript placement fails the test.
@Test func prefixedTranscriptUsesMachineName() {
    let prefixed = VoiceWakeForwarder.prefixedTranscript("hello world", machineName: "My-Mac")

    let expected = "User talked via voice recognition on My-Mac - repeat prompt first + remember some words might be incorrectly transcribed.\n\nhello world"
    #expect(prefixed == expected)
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,11 +4,11 @@ Author: steipete · Updated: 2025-12-06 · Scope: macOS app (`apps/macos`)
|
||||||
|
|
||||||
- **Idle:** Normal icon animation (blink, occasional wiggle).
|
- **Idle:** Normal icon animation (blink, occasional wiggle).
|
||||||
- **Paused:** Status item uses `appearsDisabled`; no motion.
|
- **Paused:** Status item uses `appearsDisabled`; no motion.
|
||||||
- **Voice trigger (big ears):** Voice wake detector calls `AppState.triggerVoiceEars()` → `earBoostActive=true` for ~5s. Ears scale up (1.9x), get circular ear holes for readability, then auto-reset. Only fired from the in-app voice pipeline.
|
- **Voice trigger (big ears):** Voice wake detector calls `AppState.triggerVoiceEars(ttl: nil)` when the wake word is heard, keeping `earBoostActive=true` while the utterance is captured. Ears scale up (1.9x), get circular ear holes for readability, then drop via `stopVoiceEars()` after 1s of silence. Only fired from the in-app voice pipeline.
|
||||||
- **Working (agent running):** `AppState.isWorking=true` drives a “tail/leg scurry” micro-motion: faster leg wiggle and slight offset while work is in-flight. Currently toggled around WebChat agent runs; add the same toggle around other long tasks when you wire them.
|
- **Working (agent running):** `AppState.isWorking=true` drives a “tail/leg scurry” micro-motion: faster leg wiggle and slight offset while work is in-flight. Currently toggled around WebChat agent runs; add the same toggle around other long tasks when you wire them.
|
||||||
|
|
||||||
Wiring points
|
Wiring points
|
||||||
- Voice wake: see `VoiceWakeTester.handleResult` in `AppMain.swift`—on detection it calls `triggerVoiceEars()`.
|
- Voice wake: runtime/tester call `AppState.triggerVoiceEars(ttl: nil)` on trigger and `stopVoiceEars()` after 1s of silence to match the capture window.
|
||||||
- Agent activity: set `AppStateStore.shared.setWorking(true/false)` around work spans (already done in WebChat agent call). Keep spans short and reset in `defer` blocks to avoid stuck animations.
|
- Agent activity: set `AppStateStore.shared.setWorking(true/false)` around work spans (already done in WebChat agent call). Keep spans short and reset in `defer` blocks to avoid stuck animations.
|
||||||
|
|
||||||
Shapes & sizes
|
Shapes & sizes
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
# Voice Wake Pipeline
|
||||||
|
|
||||||
|
Updated: 2025-12-08 · Owners: mac app
|
||||||
|
|
||||||
|
## Runtime behavior
|
||||||
|
- Always-on listener (Speech framework) waits for any trigger word.
|
||||||
|
- On first trigger hit: start capture, raise ears immediately via `AppState.triggerVoiceEars(ttl: nil)`, reset capture buffer.
|
||||||
|
- While capturing: keep buffer in sync with partial transcripts; update `lastHeard` whenever a non-empty partial transcript arrives.
|
||||||
|
- End capture when 1.0s of silence is observed (or 8s hard stop), then call `stopVoiceEars()`, prepend the voice-prefix string, send once to Claude, and restart the recognizer for a clean next trigger. A short 350ms debounce prevents double-fires.
|
||||||
|
|
||||||
|
## Visual states
|
||||||
|
- **Listening for trigger:** idle icon.
|
||||||
|
- **Wake word detected / capturing:** ears enlarged with holes; stays up until silence end, not a fixed timer.
|
||||||
|
- **After capture ends:** ears drop as soon as the silence window (or the hard stop) elapses, just before the transcript is forwarded; icon returns to idle.
|
||||||
|
|
||||||
|
## Forwarding payload
|
||||||
|
- Uses `VoiceWakeForwarder.prefixedTranscript(_:)` to prepend the model hint:
|
||||||
|
`User talked via voice recognition on <machine> - repeat prompt first + remember some words might be incorrectly transcribed.`
|
||||||
|
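- Machine name resolves to `Host.current().localizedName`, falling back to `ProcessInfo.processInfo.hostName` (or "this Mac" if the result is empty); callers can pass `machineName` to override it, e.g. in tests.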
|
||||||
|
|
||||||
|
## Testing hooks
|
||||||
|
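- Settings tester approximates the runtime: same 1.0s silence window, same prefix, same ear behavior. Unlike the runtime, it forwards the transcript as soon as the trigger is detected and caps the post-trigger hold at 6s instead of 8s.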
|
||||||
|
- Unit test: `VoiceWakeForwarderTests.prefixedTranscriptUsesMachineName` covers the prefix format.
|
||||||
|
|
||||||
|
## Tuning knobs (swift constants)
|
||||||
|
- Silence window: 1.0s (`silenceWindow` in `VoiceWakeRuntime`).
|
||||||
|
- Hard stop after trigger: 8s (`captureHardStop`).
|
||||||
|
- Post-send debounce: 0.35s (`debounceAfterSend`).
|
||||||
Loading…
Reference in New Issue