Skip to content

Commit b4270c5

Browse files
authored
Merge pull request #2 from argmaxinc/chen/OSSUpdate
Remove tracking, add background recording and stream mode config
2 parents ff7aac2 + d99e0f1 commit b4270c5

File tree

7 files changed

+148
-81
lines changed

7 files changed

+148
-81
lines changed

Playground.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
74F3B7C12E1CF4F400C544D1 /* AudioProcess.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F3B7C02E1CF4F400C544D1 /* AudioProcess.swift */; };
3030
74F860942E29A9D20007163C /* ProcessTapper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F860932E29A9D20007163C /* ProcessTapper.swift */; };
3131
74F860962E2B19060007163C /* CoreAudioUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F860952E2B19060007163C /* CoreAudioUtils.swift */; };
32+
74F897792E4F9B130045252E /* TranscriptionModeSelection.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */; };
3233
/* End PBXBuildFile section */
3334

3435
/* Begin PBXCopyFilesBuildPhase section */
@@ -79,6 +80,7 @@
7980
74F3B7C02E1CF4F400C544D1 /* AudioProcess.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioProcess.swift; sourceTree = "<group>"; };
8081
74F860932E29A9D20007163C /* ProcessTapper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProcessTapper.swift; sourceTree = "<group>"; };
8182
74F860952E2B19060007163C /* CoreAudioUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CoreAudioUtils.swift; sourceTree = "<group>"; };
83+
74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptionModeSelection.swift; sourceTree = "<group>"; };
8284
/* End PBXFileReference section */
8385

8486
/* Begin PBXFrameworksBuildPhase section */
@@ -130,6 +132,7 @@
130132
1677AFE42B5769E5008C61C0 /* Views */ = {
131133
isa = PBXGroup;
132134
children = (
135+
74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */,
133136
74312CDD2E1DA46C000D994A /* StreamResultView.swift */,
134137
1677AFE52B57704E008C61C0 /* ContentView.swift */,
135138
74F3B7BB2E1C7C8B00C544D1 /* ToastMessage.swift */,
@@ -292,6 +295,7 @@
292295
746E4C062E39874F009623D7 /* DefaultEnvInitializer.swift in Sources */,
293296
1677AFC22B57618A008C61C0 /* Playground.swift in Sources */,
294297
748BA5502E1B2EC6008DA1B8 /* StreamViewModel.swift in Sources */,
298+
74F897792E4F9B130045252E /* TranscriptionModeSelection.swift in Sources */,
295299
746E4C0A2E398757009623D7 /* PlaygroundEnvInitializer.swift in Sources */,
296300
74F3B7BC2E1C7C8B00C544D1 /* ToastMessage.swift in Sources */,
297301
74312CDE2E1DA46C000D994A /* StreamResultView.swift in Sources */,

Playground/Info.plist

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,9 @@
99
<key>NSPrivacyAccessedAPIType</key>
1010
<string>NSPrivacyAccessedAPICategoryUserDefaults</string>
1111
</dict>
12+
<key>UIBackgroundModes</key>
13+
<array>
14+
<string>audio</string>
15+
</array>
1216
</dict>
1317
</plist>

Playground/ViewModels/StreamViewModel.swift

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ class StreamViewModel: ObservableObject {
5050
let sdkCoordinator: ArgmaxSDKCoordinator
5151

5252
private var streamTasks: [Task<Void, Never>] = []
53+
// Throttle guards to avoid overwhelming the UI with high-frequency updates
54+
private var lastEnergyUpdateAt: TimeInterval = 0
55+
private var lastHypothesisUpdateAtBySource: [String: TimeInterval] = [:]
5356

5457
// Currently active streaming sources, set only in startTranscribing
5558
private var curActiveStreamSrcs: [any StreamSourceProtocol] = []
@@ -282,9 +285,16 @@ class StreamViewModel: ObservableObject {
282285
private func handleResult(_ result: LiveResult, for sourceId: String) {
283286
switch result {
284287
case .hypothesis(let text, _):
288+
let now = Date().timeIntervalSince1970
289+
let last = lastHypothesisUpdateAtBySource[sourceId] ?? 0
290+
// Update at most 10 times per second per source
291+
guard now - last >= 0.1 else { return }
292+
lastHypothesisUpdateAtBySource[sourceId] = now
293+
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
294+
guard trimmed != (isDeviceSource(sourceId) ? deviceResult?.hypothesisText : systemResult?.hypothesisText) else { return }
285295
updateStreamResult(sourceId: sourceId) { oldResult in
286296
var newResult = oldResult
287-
newResult.hypothesisText = text.trimmingCharacters(in: .whitespacesAndNewlines)
297+
newResult.hypothesisText = trimmed
288298
return newResult
289299
}
290300

@@ -311,10 +321,23 @@ class StreamViewModel: ObservableObject {
311321
@MainActor
312322
private func updateAudioMetrics(for source: ArgmaxSource, audioData: [Float]) {
313323
if case .device = source.streamType, let whisperKitPro = self.sdkCoordinator.whisperKit {
324+
let now = Date().timeIntervalSince1970
325+
guard now - lastEnergyUpdateAt >= 0.1 else { return }
326+
lastEnergyUpdateAt = now
327+
328+
// Limit the amount of energy samples passed to the UI for performance
329+
let energies = whisperKitPro.audioProcessor.relativeEnergy
330+
#if os(iOS)
331+
let newBufferEnergy = Array(energies.suffix(256))
332+
#else
333+
let newBufferEnergy = energies
334+
#endif
335+
let sampleCount = whisperKitPro.audioProcessor.audioSamples.count
336+
314337
updateStreamResult(sourceId: source.id) { oldResult in
315338
var newResult = oldResult
316-
newResult.bufferEnergy = whisperKitPro.audioProcessor.relativeEnergy
317-
newResult.bufferSeconds = Double(whisperKitPro.audioProcessor.audioSamples.count) / Double(WhisperKit.sampleRate)
339+
newResult.bufferEnergy = newBufferEnergy
340+
newResult.bufferSeconds = Double(sampleCount) / Double(WhisperKit.sampleRate)
318341
return newResult
319342
}
320343
}

Playground/Views/ContentView.swift

Lines changed: 74 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import Hub
2727
///
2828
/// The view integrates with several key components:
2929
/// - `StreamViewModel`: Manages real-time audio streaming and transcription
30-
/// - `TranscribeViewModel`: Handles file-based transcription and recording workflows
30+
/// - `TranscribeViewModel`: Handles file-based transcription and recording workflows
3131
/// - `ArgmaxSDKCoordinator`: Coordinates access to WhisperKit and SpeakerKit instances
3232
/// - Audio discovery services for device and process selection (macOS)
3333
///
@@ -75,6 +75,8 @@ struct ContentView: View {
7575
@AppStorage("silenceThreshold") private var silenceThreshold: Double = 0.2
7676
@AppStorage("maxSilenceBufferLength") private var maxSilenceBufferLength: Double = 10.0
7777
@AppStorage("transcribeInterval") private var transcribeInterval: Double = 0.1
78+
@AppStorage("minProcessInterval") private var minProcessInterval: Double = 0.0
79+
@AppStorage("transcriptionMode") private var transcriptionModeRawValue: String = TranscriptionModeSelection.voiceTriggered.rawValue
7880
@AppStorage("useVAD") private var useVAD: Bool = true
7981
@AppStorage("tokenConfirmationsNeeded") private var tokenConfirmationsNeeded: Double = 2
8082
@AppStorage("concurrentWorkerCount") private var concurrentWorkerCount: Double = 4
@@ -91,6 +93,16 @@ struct ContentView: View {
9193
@AppStorage("fastLoadDecoderComputeUnits") private var fastLoadDecoderComputeUnits: MLComputeUnits = .cpuAndNeuralEngine
9294
#endif
9395
@AppStorage("trackingPermissionStatePro") private var trackingPermissionStateRawValue: Int = TrackingPermissionState.undetermined.rawValue
96+
97+
/// The persisted stream transcription mode, exposed as a `TranscriptionModeSelection`.
///
/// Reading decodes `transcriptionModeRawValue`; a stored string that does not match any
/// case falls back to `.voiceTriggered`. Writing stores the new case's raw value back
/// into `transcriptionModeRawValue`.
private var transcriptionMode: TranscriptionModeSelection {
    get {
        TranscriptionModeSelection(rawValue: transcriptionModeRawValue) ?? .voiceTriggered
    }
    // `nonmutating`: the write goes through the `@AppStorage` wrapper rather than
    // mutating the view struct itself, so the setter stays callable from `body`
    // and from closures where `self` is immutable.
    nonmutating set {
        transcriptionModeRawValue = newValue.rawValue
    }
}
94106

95107
// MARK: Standard properties
96108

@@ -139,7 +151,6 @@ struct ContentView: View {
139151

140152
// MARK: Alerts
141153

142-
@State private var showReportingAlert = false
143154
@State private var showShortAudioWarningAlert: Bool = false
144155
@State private var showPermissionAlert: Bool = false
145156
@State private var permissionAlertMessage: String = ""
@@ -184,18 +195,6 @@ struct ContentView: View {
184195
set: { newValue in
185196
trackingPermissionStateRawValue = newValue ? TrackingPermissionState.granted.rawValue : TrackingPermissionState.denied.rawValue
186197
Logging.debug(newValue)
187-
188-
if newValue {
189-
sdkCoordinator.setupArgmax()
190-
analyticsLogger.configureIfNeeded()
191-
} else {
192-
Task {
193-
if await ArgmaxSDK.enabled() {
194-
await ArgmaxSDK.close()
195-
}
196-
Logging.debug("Shutting down ArgmaxSDK")
197-
}
198-
}
199198
}
200199
)
201200
}
@@ -348,18 +347,6 @@ struct ContentView: View {
348347
#endif
349348
.navigationSplitViewColumnWidth(min: 300, ideal: 350)
350349
.padding(.horizontal)
351-
.alert(isPresented: $showReportingAlert) {
352-
Alert(
353-
title: Text("Performance Reporting"),
354-
message: Text("Help us catch bugs early and improve reliability by enabling reporting and performance monitoring. Required to enable experimental features. Learn more at [argmaxinc.com/privacy](https://www.argmaxinc.com/privacy)"),
355-
primaryButton: .default(Text("Enable reporting")) {
356-
updateTracking(state: .granted)
357-
},
358-
secondaryButton: .cancel(Text("Opt Out")) {
359-
updateTracking(state: .denied)
360-
}
361-
)
362-
}
363350
} detail: {
364351
VStack {
365352
#if os(iOS)
@@ -448,12 +435,6 @@ struct ContentView: View {
448435
showWhisperKitComputeUnits = true
449436
speakerKitComputeUnitsExpanded = false
450437

451-
showReportingAlert = (trackingPermissionStateRawValue == 0) // undetermined
452-
if trackingPermissionStateRawValue == TrackingPermissionState.granted.rawValue {
453-
sdkCoordinator.setupArgmax()
454-
analyticsLogger.configureIfNeeded()
455-
}
456-
457438
// Check if Pro models are supported on this OS version
458439
if #unavailable(macOS 15, iOS 18, watchOS 11, visionOS 2) {
459440
showOSVersionAlert = true
@@ -1425,27 +1406,59 @@ struct ContentView: View {
14251406
}
14261407
.padding(.horizontal)
14271408

1428-
VStack {
1429-
Text("Silence Threshold")
1409+
Section(header: Text("Stream Mode Settings")) {
14301410
HStack {
1431-
Slider(value: $silenceThreshold, in: 0...1, step: 0.05)
1432-
Text(silenceThreshold.formatted(.number))
1433-
.frame(width: 30)
1434-
InfoButton("Relative silence threshold for the audio. \n Baseline is set by the quietest 100ms in the previous 2 seconds.")
1411+
Picker("Mode", selection: Binding(
1412+
get: { TranscriptionModeSelection(rawValue: transcriptionModeRawValue) ?? .voiceTriggered },
1413+
set: { transcriptionModeRawValue = $0.rawValue }
1414+
)) {
1415+
ForEach(TranscriptionModeSelection.allCases) { mode in
1416+
Text(mode.displayName).tag(mode)
1417+
}
1418+
}
1419+
.pickerStyle(MenuPickerStyle())
1420+
Spacer()
1421+
InfoButton(transcriptionMode.description)
14351422
}
1436-
}
1437-
.padding(.horizontal)
1438-
1439-
VStack {
1440-
Text("Max Silence Buffer Size")
1441-
HStack {
1442-
Slider(value: $maxSilenceBufferLength, in: 10...60, step: 1)
1443-
Text(maxSilenceBufferLength.formatted(.number))
1444-
.frame(width: 30)
1445-
InfoButton("Seconds of silence to buffer before audio is sent for transcription.")
1423+
1424+
if transcriptionMode == .voiceTriggered {
1425+
VStack {
1426+
Text("Silence Threshold")
1427+
HStack {
1428+
Slider(value: $silenceThreshold, in: 0...1, step: 0.05)
1429+
Text(silenceThreshold.formatted(.number.precision(.fractionLength(1))))
1430+
.frame(width: 30)
1431+
.lineLimit(1)
1432+
InfoButton("Relative silence threshold for the audio. \n Baseline is set by the quietest 100ms in the previous 2 seconds.")
1433+
}
1434+
}
1435+
.padding(.horizontal)
1436+
1437+
VStack {
1438+
Text("Max Silence Buffer Size")
1439+
HStack {
1440+
Slider(value: $maxSilenceBufferLength, in: 10...60, step: 1)
1441+
Text(maxSilenceBufferLength.formatted(.number.precision(.fractionLength(0))))
1442+
.frame(width: 30)
1443+
.lineLimit(1)
1444+
InfoButton("Seconds of silence to buffer before audio is sent for transcription.")
1445+
}
1446+
}
1447+
.padding(.horizontal)
1448+
1449+
VStack {
1450+
Text("Min Process Interval")
1451+
HStack {
1452+
Slider(value: $minProcessInterval, in: 0...15, step: 1)
1453+
Text(minProcessInterval.formatted(.number.precision(.fractionLength(0))))
1454+
.frame(width: 30)
1455+
.lineLimit(1)
1456+
InfoButton("Minimum interval the incoming stream data is fed to transcription pipeline.")
1457+
}
1458+
}
1459+
.padding(.horizontal)
14461460
}
14471461
}
1448-
.padding(.horizontal)
14491462

14501463
VStack {
14511464
Text("Transcribe Interval")
@@ -1458,21 +1471,6 @@ struct ContentView: View {
14581471
}
14591472
.padding(.horizontal)
14601473

1461-
Section(header: Text("Performance Reporting")) {
1462-
VStack(alignment: .leading) {
1463-
HStack {
1464-
Text("Enable Reporting")
1465-
InfoButton("Help us catch bugs early and improve reliability by enabling reporting and performance monitoring.")
1466-
Spacer()
1467-
Toggle("", isOn: trackingPermissionBinding)
1468-
}
1469-
Link(destination: URL(string: "https://www.argmaxinc.com/privacy")!) {
1470-
Text("Learn more at argmaxinc.com/privacy")
1471-
}
1472-
}
1473-
.padding(.horizontal)
1474-
.padding(.top)
1475-
}
14761474
Section(header: Text("Diarization Settings")) {
14771475
HStack {
14781476
Picker("Diarization", selection: $diarizationMode) {
@@ -2074,11 +2072,21 @@ struct ContentView: View {
20742072
isRecording = true
20752073
}
20762074

2075+
let streamMode: StreamTranscriptionMode
2076+
switch transcriptionMode {
2077+
case .alwaysOn:
2078+
streamMode = .alwaysOn
2079+
case .voiceTriggered:
2080+
streamMode = .voiceTriggered(silenceThreshold: Float(silenceThreshold), maxBufferLength: Float(maxSilenceBufferLength), minProcessInterval: Float(minProcessInterval))
2081+
case .batteryOptimized:
2082+
streamMode = .batteryOptimized
2083+
}
2084+
20772085
try await streamViewModel.startTranscribing(
20782086
options: DecodingOptionsPro(
20792087
base: decodingOptions,
20802088
transcribeInterval: transcribeInterval,
2081-
streamTranscriptionMode: .voiceTriggered(silenceThreshold: Float(silenceThreshold), maxBufferLength: Float(maxSilenceBufferLength))
2089+
streamTranscriptionMode: streamMode
20822090
)
20832091
)
20842092
} catch {
@@ -2188,6 +2196,7 @@ struct ContentView: View {
21882196
"compression_check_window": "\(compressionCheckWindow)",
21892197
"sample_length": "\(sampleLength)",
21902198
"silence_threshold": "\(silenceThreshold)",
2199+
"transcription_mode": "\(transcriptionMode.rawValue)",
21912200
"use_vad": "\(useVAD)",
21922201
"token_confirmations_needed": "\(tokenConfirmationsNeeded)",
21932202
"chunking_strategy": "\(chunkingStrategy)",

Playground/Views/StreamResultView.swift

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,13 @@ struct StreamResultLine: View {
5353
.id("bottom")
5454
}
5555
.onChange(of: result.confirmedText) {
56-
withAnimation(.easeOut(duration: 0.3)) {
56+
withAnimation(.easeOut(duration: 0.15)) {
5757
proxy.scrollTo("bottom", anchor: .bottom)
5858
}
5959
}
60+
// Avoid animating on every hypothesis token; still scroll to the bottom, just without animation
6061
.onChange(of: result.hypothesisText) {
61-
withAnimation(.easeOut(duration: 0.3)) {
62-
proxy.scrollTo("bottom", anchor: .bottom)
63-
}
62+
proxy.scrollTo("bottom", anchor: .bottom)
6463
}
6564
}
6665
}
Playground/Views/TranscriptionModeSelection.swift

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/// The stream-processing transcription modes a user can pick from.
///
/// Each case's raw value is the case name itself, so the value can be stored
/// as a plain string and decoded again with `init(rawValue:)`.
enum TranscriptionModeSelection: String, CaseIterable, Identifiable {
    case alwaysOn, voiceTriggered, batteryOptimized

    /// Identity for `Identifiable`: the case's raw string value.
    var id: String { self.rawValue }

    /// Short, human-readable label for the mode.
    var displayName: String {
        switch self {
        case .alwaysOn: return "Always-On"
        case .voiceTriggered: return "Voice-Triggered"
        case .batteryOptimized: return "Battery-Optimized"
        }
    }

    /// One- or two-sentence explanation of the mode.
    var description: String {
        switch self {
        case .alwaysOn: return "Continuous real-time transcription with lowest latency. Uses more system resources."
        case .voiceTriggered: return "Processes only audio above energy threshold. Conserves battery while staying responsive."
        case .batteryOptimized: return "Intelligent streaming with dynamic optimizations for maximum battery life."
        }
    }
}

Playground/Views/VoiceEnergyView.swift

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import Foundation
22
import SwiftUI
33

4-
/// A SwiftUI view that visualizes audio buffer energy levels with threshold-based color coding.
4+
/// A SwiftUI view that visualizes audio buffer energy levels with threshold-based color coding.
55
/// This component provides real-time visual feedback for audio input levels and voice activity detection.
66
///
77
/// ## Features
@@ -28,14 +28,12 @@ struct VoiceEnergyView: View {
2828

2929
var body: some View {
3030
ScrollView(.horizontal) {
31-
HStack(spacing: 1) {
32-
ForEach(Array(bufferEnergy.enumerated())[0...], id: \.element) { _, energy in
33-
ZStack {
34-
RoundedRectangle(cornerRadius: 2)
35-
.frame(width: 2, height: CGFloat(energy) * 24)
36-
}
37-
.frame(maxHeight: 24)
38-
.background(energy > Float(silenceThreshold) ? Color.green.opacity(0.2) : Color.red.opacity(0.2))
31+
LazyHStack(spacing: 1) {
32+
ForEach(Array(bufferEnergy.enumerated()), id: \.offset) { _, energy in
33+
RoundedRectangle(cornerRadius: 2)
34+
.frame(width: 2, height: max(0, min(CGFloat(energy), 1)) * 24)
35+
.frame(maxHeight: 24)
36+
.background(energy > Float(silenceThreshold) ? Color.green.opacity(0.2) : Color.red.opacity(0.2))
3937
}
4038
}
4139
}

0 commit comments

Comments
 (0)