feat(mac): tokenized voice overlay adoption
parent
cfd2c41c21
commit
d084a37e11
|
|
@ -77,6 +77,8 @@ final class VoicePushToTalkHotkey {
|
||||||
actor VoicePushToTalk {
|
actor VoicePushToTalk {
|
||||||
static let shared = VoicePushToTalk()
|
static let shared = VoicePushToTalk()
|
||||||
|
|
||||||
|
private let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
|
||||||
|
|
||||||
private var recognizer: SFSpeechRecognizer?
|
private var recognizer: SFSpeechRecognizer?
|
||||||
private var audioEngine = AVAudioEngine()
|
private var audioEngine = AVAudioEngine()
|
||||||
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
||||||
|
|
@ -89,6 +91,8 @@ actor VoicePushToTalk {
|
||||||
private var triggerChimePlayed = false
|
private var triggerChimePlayed = false
|
||||||
private var finalized = false
|
private var finalized = false
|
||||||
private var timeoutTask: Task<Void, Never>?
|
private var timeoutTask: Task<Void, Never>?
|
||||||
|
private var overlayToken: UUID?
|
||||||
|
private var adoptedPrefix: String = ""
|
||||||
|
|
||||||
private struct Config {
|
private struct Config {
|
||||||
let micID: String?
|
let micID: String?
|
||||||
|
|
@ -112,14 +116,22 @@ actor VoicePushToTalk {
|
||||||
self.triggerChimePlayed = false
|
self.triggerChimePlayed = false
|
||||||
self.finalized = false
|
self.finalized = false
|
||||||
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
||||||
|
let snapshot = await MainActor.run { VoiceWakeOverlayController.shared.snapshot() }
|
||||||
|
self.adoptedPrefix = snapshot.isVisible ? snapshot.text.trimmingCharacters(in: .whitespacesAndNewlines) : ""
|
||||||
|
self.logger.info("ptt begin adopted_prefix_len=\(self.adoptedPrefix.count, privacy: .public)")
|
||||||
if config.triggerChime != .none {
|
if config.triggerChime != .none {
|
||||||
self.triggerChimePlayed = true
|
self.triggerChimePlayed = true
|
||||||
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
|
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "ptt.trigger") }
|
||||||
}
|
}
|
||||||
// Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
|
// Pause the always-on wake word recognizer so both pipelines don't fight over the mic tap.
|
||||||
await VoiceWakeRuntime.shared.pauseForPushToTalk()
|
await VoiceWakeRuntime.shared.pauseForPushToTalk()
|
||||||
await MainActor.run {
|
let adoptedPrefix = self.adoptedPrefix
|
||||||
VoiceWakeOverlayController.shared.showPartial(transcript: "")
|
let adoptedAttributed: NSAttributedString? = adoptedPrefix.isEmpty ? nil : Self.makeAttributed(committed: adoptedPrefix, volatile: "", isFinal: false)
|
||||||
|
self.overlayToken = await MainActor.run {
|
||||||
|
VoiceWakeOverlayController.shared.startSession(
|
||||||
|
source: .pushToTalk,
|
||||||
|
transcript: adoptedPrefix,
|
||||||
|
attributed: adoptedAttributed)
|
||||||
}
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
|
|
@ -143,7 +155,7 @@ actor VoicePushToTalk {
|
||||||
// Give Speech a brief window to deliver the final result; otherwise fall back to current text.
|
// Give Speech a brief window to deliver the final result; otherwise fall back to current text.
|
||||||
self.timeoutTask?.cancel()
|
self.timeoutTask?.cancel()
|
||||||
self.timeoutTask = Task { [weak self] in
|
self.timeoutTask = Task { [weak self] in
|
||||||
try? await Task.sleep(nanoseconds: 700_000_000) // 700ms grace period
|
try? await Task.sleep(nanoseconds: 1_500_000_000) // 1.5s grace period to await final result
|
||||||
await self?.finalize(transcriptOverride: nil, reason: "timeout")
|
await self?.finalize(transcriptOverride: nil, reason: "timeout")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -175,8 +187,7 @@ actor VoicePushToTalk {
|
||||||
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
|
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
|
||||||
guard let self else { return }
|
guard let self else { return }
|
||||||
if let error {
|
if let error {
|
||||||
Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
|
self.logger.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
|
||||||
.debug("push-to-talk error: \(error.localizedDescription, privacy: .public)")
|
|
||||||
}
|
}
|
||||||
let transcript = result?.bestTranscription.formattedString
|
let transcript = result?.bestTranscription.formattedString
|
||||||
let isFinal = result?.isFinal ?? false
|
let isFinal = result?.isFinal ?? false
|
||||||
|
|
@ -200,10 +211,13 @@ actor VoicePushToTalk {
|
||||||
self.volatile = Self.delta(after: self.committed, current: transcript)
|
self.volatile = Self.delta(after: self.committed, current: transcript)
|
||||||
}
|
}
|
||||||
|
|
||||||
let snapshot = self.committed + self.volatile
|
let committedWithPrefix = Self.join(self.adoptedPrefix, self.committed)
|
||||||
let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: isFinal)
|
let snapshot = Self.join(committedWithPrefix, self.volatile)
|
||||||
await MainActor.run {
|
let attributed = Self.makeAttributed(committed: committedWithPrefix, volatile: self.volatile, isFinal: isFinal)
|
||||||
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
|
if let token = self.overlayToken {
|
||||||
|
await MainActor.run {
|
||||||
|
VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -212,14 +226,18 @@ actor VoicePushToTalk {
|
||||||
self.finalized = true
|
self.finalized = true
|
||||||
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
self.timeoutTask?.cancel(); self.timeoutTask = nil
|
||||||
|
|
||||||
let finalText: String = {
|
let finalRecognized: String = {
|
||||||
if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
|
if let override = transcriptOverride?.trimmingCharacters(in: .whitespacesAndNewlines) {
|
||||||
return override
|
return override
|
||||||
}
|
}
|
||||||
return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
|
return (self.committed + self.volatile).trimmingCharacters(in: .whitespacesAndNewlines)
|
||||||
}()
|
}()
|
||||||
|
let finalText = Self.join(self.adoptedPrefix, finalRecognized)
|
||||||
|
|
||||||
let attributed = Self.makeAttributed(committed: self.committed, volatile: self.volatile, isFinal: true)
|
let attributed = Self.makeAttributed(
|
||||||
|
committed: Self.join(self.adoptedPrefix, self.committed),
|
||||||
|
volatile: self.volatile,
|
||||||
|
isFinal: true)
|
||||||
let forward: VoiceWakeForwardConfig
|
let forward: VoiceWakeForwardConfig
|
||||||
if let cached = self.activeConfig?.forwardConfig {
|
if let cached = self.activeConfig?.forwardConfig {
|
||||||
forward = cached
|
forward = cached
|
||||||
|
|
@ -228,19 +246,28 @@ actor VoicePushToTalk {
|
||||||
}
|
}
|
||||||
let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
|
let chime = finalText.isEmpty ? .none : (self.activeConfig?.sendChime ?? .none)
|
||||||
|
|
||||||
|
let token = self.overlayToken
|
||||||
|
let logger = self.logger
|
||||||
await MainActor.run {
|
await MainActor.run {
|
||||||
Logger(subsystem: "com.steipete.clawdis", category: "voicewake.ptt")
|
logger.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
|
||||||
.info("ptt finalize reason=\(reason, privacy: .public) len=\(finalText.count, privacy: .public)")
|
|
||||||
if finalText.isEmpty {
|
if finalText.isEmpty {
|
||||||
VoiceWakeOverlayController.shared.dismiss(reason: .empty)
|
VoiceWakeOverlayController.shared.dismiss(token: token, reason: .empty)
|
||||||
} else {
|
} else if let token {
|
||||||
VoiceWakeOverlayController.shared.presentFinal(
|
VoiceWakeOverlayController.shared.presentFinal(
|
||||||
|
token: token,
|
||||||
transcript: finalText,
|
transcript: finalText,
|
||||||
forwardConfig: forward,
|
forwardConfig: forward,
|
||||||
autoSendAfter: nil,
|
autoSendAfter: nil,
|
||||||
sendChime: chime,
|
sendChime: chime,
|
||||||
attributed: attributed)
|
attributed: attributed)
|
||||||
VoiceWakeOverlayController.shared.sendNow(sendChime: chime)
|
VoiceWakeOverlayController.shared.sendNow(token: token, sendChime: chime)
|
||||||
|
} else {
|
||||||
|
if chime != .none {
|
||||||
|
VoiceWakeChimePlayer.play(chime, reason: "ptt.fallback_send")
|
||||||
|
}
|
||||||
|
Task.detached {
|
||||||
|
await VoiceWakeForwarder.forward(transcript: finalText, config: forward)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -254,6 +281,8 @@ actor VoicePushToTalk {
|
||||||
self.volatile = ""
|
self.volatile = ""
|
||||||
self.activeConfig = nil
|
self.activeConfig = nil
|
||||||
self.triggerChimePlayed = false
|
self.triggerChimePlayed = false
|
||||||
|
self.overlayToken = nil
|
||||||
|
self.adoptedPrefix = ""
|
||||||
|
|
||||||
// Resume the wake-word runtime after push-to-talk finishes.
|
// Resume the wake-word runtime after push-to-talk finishes.
|
||||||
await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
|
await VoiceWakeRuntime.shared.applyPushToTalkCooldown()
|
||||||
|
|
@ -284,6 +313,12 @@ actor VoicePushToTalk {
|
||||||
return (committedColor, volatileColor)
|
return (committedColor, volatileColor)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Joins two transcript fragments with exactly one separating space.
///
/// - Parameters:
///   - prefix: Leading fragment (for example, text adopted from an overlay that was already visible).
///   - suffix: Trailing fragment (for example, newly recognized speech).
/// - Returns: `suffix` unchanged when `prefix` is empty and `prefix` unchanged when `suffix` is
///   empty; otherwise both fragments separated by a single space.
private static func join(_ prefix: String, _ suffix: String) -> String {
    if prefix.isEmpty { return suffix }
    if suffix.isEmpty { return prefix }

    // Normalize whitespace only at the seam: a fragment that already carries its own boundary
    // space (" world") must not produce a doubled separator ("hello  world").
    var head = Substring(prefix)
    while let last = head.last, last.isWhitespace { head.removeLast() }
    let tail = suffix.drop(while: \.isWhitespace)

    // A fragment consisting solely of whitespace contributes nothing to the joined text.
    if tail.isEmpty { return String(head) }
    if head.isEmpty { return String(tail) }
    return "\(head) \(tail)"
}
|
||||||
|
|
||||||
private static func delta(after committed: String, current: String) -> String {
|
private static func delta(after committed: String, current: String) -> String {
|
||||||
if current.hasPrefix(committed) {
|
if current.hasPrefix(committed) {
|
||||||
let start = current.index(current.startIndex, offsetBy: committed.count)
|
let start = current.index(current.startIndex, offsetBy: committed.count)
|
||||||
|
|
|
||||||
|
|
@ -44,9 +44,13 @@ enum VoiceWakeChimePlayer {
|
||||||
private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.chime")
|
private static let logger = Logger(subsystem: "com.steipete.clawdis", category: "voicewake.chime")
|
||||||
private static var lastSound: NSSound?
|
private static var lastSound: NSSound?
|
||||||
|
|
||||||
static func play(_ chime: VoiceWakeChime) {
|
static func play(_ chime: VoiceWakeChime, reason: String? = nil) {
|
||||||
guard let sound = self.sound(for: chime) else { return }
|
guard let sound = self.sound(for: chime) else { return }
|
||||||
self.logger.log(level: .info, "chime play")
|
if let reason {
|
||||||
|
self.logger.log(level: .info, "chime play reason=\(reason, privacy: .public)")
|
||||||
|
} else {
|
||||||
|
self.logger.log(level: .info, "chime play")
|
||||||
|
}
|
||||||
SoundEffectPlayer.play(sound)
|
SoundEffectPlayer.play(sound)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ actor VoiceWakeRuntime {
|
||||||
private var cooldownUntil: Date?
|
private var cooldownUntil: Date?
|
||||||
private var currentConfig: RuntimeConfig?
|
private var currentConfig: RuntimeConfig?
|
||||||
private var listeningState: ListeningState = .idle
|
private var listeningState: ListeningState = .idle
|
||||||
|
private var overlayToken: UUID?
|
||||||
|
|
||||||
// Tunables
|
// Tunables
|
||||||
// Silence threshold once we've captured user speech (post-trigger).
|
// Silence threshold once we've captured user speech (post-trigger).
|
||||||
|
|
@ -162,9 +163,11 @@ actor VoiceWakeRuntime {
|
||||||
self.listeningState = .idle
|
self.listeningState = .idle
|
||||||
self.logger.debug("voicewake runtime stopped")
|
self.logger.debug("voicewake runtime stopped")
|
||||||
|
|
||||||
|
let token = self.overlayToken
|
||||||
|
self.overlayToken = nil
|
||||||
guard dismissOverlay else { return }
|
guard dismissOverlay else { return }
|
||||||
Task { @MainActor in
|
Task { @MainActor in
|
||||||
VoiceWakeOverlayController.shared.dismiss()
|
VoiceWakeOverlayController.shared.dismiss(token: token)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -208,8 +211,10 @@ actor VoiceWakeRuntime {
|
||||||
volatile: self.volatileTranscript,
|
volatile: self.volatileTranscript,
|
||||||
isFinal: isFinal)
|
isFinal: isFinal)
|
||||||
let snapshot = self.committedTranscript + self.volatileTranscript
|
let snapshot = self.committedTranscript + self.volatileTranscript
|
||||||
await MainActor.run {
|
if let token = self.overlayToken {
|
||||||
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
|
await MainActor.run {
|
||||||
|
VoiceWakeOverlayController.shared.updatePartial(token: token, transcript: snapshot, attributed: attributed)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -249,7 +254,7 @@ actor VoiceWakeRuntime {
|
||||||
|
|
||||||
if config.triggerChime != .none, !self.triggerChimePlayed {
|
if config.triggerChime != .none, !self.triggerChimePlayed {
|
||||||
self.triggerChimePlayed = true
|
self.triggerChimePlayed = true
|
||||||
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime) }
|
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") }
|
||||||
}
|
}
|
||||||
|
|
||||||
let snapshot = self.committedTranscript + self.volatileTranscript
|
let snapshot = self.committedTranscript + self.volatileTranscript
|
||||||
|
|
@ -257,8 +262,11 @@ actor VoiceWakeRuntime {
|
||||||
committed: self.committedTranscript,
|
committed: self.committedTranscript,
|
||||||
volatile: self.volatileTranscript,
|
volatile: self.volatileTranscript,
|
||||||
isFinal: false)
|
isFinal: false)
|
||||||
await MainActor.run {
|
self.overlayToken = await MainActor.run {
|
||||||
VoiceWakeOverlayController.shared.showPartial(transcript: snapshot, attributed: attributed)
|
VoiceWakeOverlayController.shared.startSession(
|
||||||
|
source: .wakeWord,
|
||||||
|
transcript: snapshot,
|
||||||
|
attributed: attributed)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keep the "ears" boosted for the capture window so the status icon animates while recording.
|
// Keep the "ears" boosted for the capture window so the status icon animates while recording.
|
||||||
|
|
@ -309,7 +317,9 @@ actor VoiceWakeRuntime {
|
||||||
self.triggerChimePlayed = false
|
self.triggerChimePlayed = false
|
||||||
|
|
||||||
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
|
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
|
||||||
await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(0) }
|
if let token = self.overlayToken {
|
||||||
|
await MainActor.run { VoiceWakeOverlayController.shared.updateLevel(token: token, 0) }
|
||||||
|
}
|
||||||
|
|
||||||
let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
|
let forwardConfig = await MainActor.run { AppStateStore.shared.voiceWakeForwardConfig }
|
||||||
// Auto-send should fire as soon as the silence threshold is satisfied (2s after speech, 5s after trigger-only).
|
// Auto-send should fire as soon as the silence threshold is satisfied (2s after speech, 5s after trigger-only).
|
||||||
|
|
@ -320,14 +330,25 @@ actor VoiceWakeRuntime {
|
||||||
volatile: "",
|
volatile: "",
|
||||||
isFinal: true)
|
isFinal: true)
|
||||||
let sendChime = finalTranscript.isEmpty ? .none : config.sendChime
|
let sendChime = finalTranscript.isEmpty ? .none : config.sendChime
|
||||||
await MainActor.run {
|
if let token = self.overlayToken {
|
||||||
VoiceWakeOverlayController.shared.presentFinal(
|
await MainActor.run {
|
||||||
transcript: finalTranscript,
|
VoiceWakeOverlayController.shared.presentFinal(
|
||||||
|
token: token,
|
||||||
|
transcript: finalTranscript,
|
||||||
forwardConfig: forwardConfig,
|
forwardConfig: forwardConfig,
|
||||||
autoSendAfter: delay,
|
autoSendAfter: delay,
|
||||||
sendChime: sendChime,
|
sendChime: sendChime,
|
||||||
attributed: finalAttributed)
|
attributed: finalAttributed)
|
||||||
|
}
|
||||||
|
} else if forwardConfig.enabled, !finalTranscript.isEmpty {
|
||||||
|
if sendChime != .none {
|
||||||
|
await MainActor.run { VoiceWakeChimePlayer.play(sendChime, reason: "voicewake.send") }
|
||||||
|
}
|
||||||
|
Task.detached {
|
||||||
|
await VoiceWakeForwarder.forward(transcript: finalTranscript, config: forwardConfig)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
self.overlayToken = nil
|
||||||
|
|
||||||
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
|
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
|
||||||
self.restartRecognizer()
|
self.restartRecognizer()
|
||||||
|
|
@ -349,8 +370,10 @@ actor VoiceWakeRuntime {
|
||||||
|
|
||||||
// Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices.
|
// Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices.
|
||||||
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
|
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
|
||||||
Task { @MainActor in
|
if let token = self.overlayToken {
|
||||||
VoiceWakeOverlayController.shared.updateLevel(clamped)
|
Task { @MainActor in
|
||||||
|
VoiceWakeOverlayController.shared.updateLevel(token: token, clamped)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,12 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when
|
||||||
- If the overlay is already visible from wake-word and the user presses the hotkey, the hotkey session *adopts* the existing text instead of resetting it. The overlay stays up while the hotkey is held. When the user releases: send if there is trimmed text, otherwise dismiss.
|
- If the overlay is already visible from wake-word and the user presses the hotkey, the hotkey session *adopts* the existing text instead of resetting it. The overlay stays up while the hotkey is held. When the user releases: send if there is trimmed text, otherwise dismiss.
|
||||||
- Wake-word alone still auto-sends on silence; push-to-talk sends immediately on release.
|
- Wake-word alone still auto-sends on silence; push-to-talk sends immediately on release.
|
||||||
|
|
||||||
### Proposed architecture (to implement next)
|
### Implemented (Dec 9, 2025)
|
||||||
|
- Overlay sessions now carry a token per capture (wake-word or push-to-talk). Partial/final/send/dismiss/level updates are dropped when their token doesn’t match the active session, so stale callbacks from an earlier capture cannot alter the current overlay.
|
||||||
|
- Push-to-talk adopts any visible overlay text as a prefix (so pressing the hotkey while the wake overlay is up keeps the text and appends new speech). It waits up to 1.5s for a final transcript before falling back to the current text.
|
||||||
|
- Chime/overlay logging is emitted at `info` in categories `voicewake.overlay`, `voicewake.ptt`, and `voicewake.chime` (session start, partial, final, send, dismiss, chime reason).
|
||||||
|
|
||||||
|
### Next steps
|
||||||
1. **VoiceSessionCoordinator (actor)**
|
1. **VoiceSessionCoordinator (actor)**
|
||||||
- Owns exactly one `VoiceSession` at a time.
|
- Owns exactly one `VoiceSession` at a time.
|
||||||
- API (token-based): `beginWakeCapture`, `beginPushToTalk`, `updatePartial`, `endCapture`, `cancel`, `applyCooldown`.
|
- API (token-based): `beginWakeCapture`, `beginPushToTalk`, `updatePartial`, `endCapture`, `cancel`, `applyCooldown`.
|
||||||
|
|
@ -40,4 +45,3 @@ Audience: macOS app contributors. Goal: keep the voice overlay predictable when
|
||||||
3. Refactor `VoicePushToTalk` to adopt existing sessions and call `endCapture` on release; apply runtime cooldown.
|
3. Refactor `VoicePushToTalk` to adopt existing sessions and call `endCapture` on release; apply runtime cooldown.
|
||||||
4. Wire `VoiceWakeOverlayController` to the publisher; remove direct calls from runtime/PTT.
|
4. Wire `VoiceWakeOverlayController` to the publisher; remove direct calls from runtime/PTT.
|
||||||
5. Add integration tests for session adoption, cooldown, and empty-text dismissal.
|
5. Add integration tests for session adoption, cooldown, and empty-text dismissal.
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue