This guide uses the direct WebSocket approach — connecting straight to
wss://api.datagrid.com/ws/voice. You can also use the REST endpoint (POST /v1/voice) to get a pre-built WebSocket URL and start message first.

Overview

The integration has four parts:

- WebSocket connection — Connect to the voice endpoint and manage the message protocol
- Audio capture — Record microphone input as 16-bit mono PCM at 16kHz
- Audio playback — Play the agent’s response audio (16-bit mono PCM at 24kHz)
- UI — A simple button to start/stop the conversation
1. WebSocket Client
Create a class that manages the WebSocket connection and message routing:

import Foundation
/// Delegate callbacks for `VoiceSession` lifecycle events and server messages.
/// Callbacks are delivered on the URLSession delegate queue (the main queue,
/// as configured in `VoiceSession.connect()`).
protocol VoiceSessionDelegate: AnyObject {
    /// The WebSocket opened; it is now safe to send the "start" message.
    func voiceSessionDidConnect(_ session: VoiceSession)
    /// Server "audio" message: a base64-encoded chunk of agent speech PCM.
    func voiceSession(_ session: VoiceSession, didReceiveAudio base64Audio: String)
    /// Server "ready" message: the session is ready for microphone streaming.
    func voiceSessionDidBecomeReady(_ session: VoiceSession)
    /// Server "started" message, carrying the session and conversation IDs.
    func voiceSession(_ session: VoiceSession, didStartSession sessionId: String, conversationId: String)
    /// Server "tool_call" message: the agent invoked a tool (status is e.g. "started").
    func voiceSession(_ session: VoiceSession, didReceiveToolCall toolName: String, status: String)
    /// Server "interrupted" message: the agent's response was cut off.
    func voiceSessionWasInterrupted(_ session: VoiceSession)
    /// Server "ended" message; the payload may include a "transcript" array.
    func voiceSession(_ session: VoiceSession, didEnd payload: [String: Any])
    /// Server "error" message (or a transport failure description).
    func voiceSession(_ session: VoiceSession, didReceiveError message: String)
}
/// Manages the voice WebSocket connection and routes protocol messages to a delegate.
///
/// Lifecycle: `connect()` → delegate `voiceSessionDidConnect` → `startSession(...)`
/// → server "started"/"ready" → stream audio via `sendAudio(base64PCM:)` →
/// `stop()` → server "ended" → `disconnect()`.
class VoiceSession: NSObject, URLSessionWebSocketDelegate {
    weak var delegate: VoiceSessionDelegate?
    private var webSocket: URLSessionWebSocketTask?
    private var urlSession: URLSession?
    private let apiKey: String
    private let baseURL: String

    init(apiKey: String, baseURL: String = "wss://api.datagrid.com") {
        self.apiKey = apiKey
        self.baseURL = baseURL
        super.init()
    }

    // MARK: - Connection

    /// Opens the WebSocket connection. Delegate callbacks arrive on the main queue.
    func connect() {
        // Build the URL with URLComponents so the token is percent-encoded
        // correctly even if the API key contains URL-reserved characters.
        guard var components = URLComponents(string: "\(baseURL)/ws/voice") else { return }
        components.queryItems = [URLQueryItem(name: "token", value: apiKey)]
        guard let url = components.url else { return }
        urlSession = URLSession(
            configuration: .default,
            delegate: self,
            delegateQueue: .main
        )
        webSocket = urlSession?.webSocketTask(with: url)
        webSocket?.resume()
        listenForMessages()
    }

    /// Closes the socket and invalidates the URLSession.
    /// Invalidation is required: URLSession holds a strong reference to its
    /// delegate (self) until invalidated, so skipping it leaks this object.
    func disconnect() {
        webSocket?.cancel(with: .goingAway, reason: nil)
        webSocket = nil
        urlSession?.finishTasksAndInvalidate()
        urlSession = nil
    }

    // MARK: - URLSessionWebSocketDelegate

    func urlSession(
        _ session: URLSession,
        webSocketTask: URLSessionWebSocketTask,
        didOpenWithProtocol protocol: String?
    ) {
        delegate?.voiceSessionDidConnect(self)
    }

    // MARK: - Sending Messages

    /// Sends the "start" message; both identifiers are optional.
    func startSession(agentId: String? = nil, conversationId: String? = nil) {
        var payload: [String: Any] = [:]
        if let agentId { payload["agent_id"] = agentId }
        if let conversationId { payload["conversation_id"] = conversationId }
        send(type: "start", payload: payload)
    }

    /// Streams one base64-encoded chunk of 16-bit/16 kHz microphone PCM.
    func sendAudio(base64PCM: String) {
        send(type: "audio", payload: ["data": base64PCM])
    }

    /// Asks the server to end the session (it replies with "ended").
    func stop() {
        send(type: "stop")
    }

    /// Cuts off the agent's in-progress response.
    func interrupt() {
        send(type: "interrupt")
    }

    // MARK: - Private

    /// Encodes `{"type": ..., "payload": ...}` as JSON text and sends it.
    private func send(type: String, payload: [String: Any]? = nil) {
        var message: [String: Any] = ["type": type]
        if let payload { message["payload"] = payload }
        guard let data = try? JSONSerialization.data(withJSONObject: message),
              let string = String(data: data, encoding: .utf8) else { return }
        webSocket?.send(.string(string)) { error in
            if let error {
                print("[VoiceSession] Send error: \(error.localizedDescription)")
            }
        }
    }

    /// Receives one message, dispatches it, then re-arms itself.
    /// URLSessionWebSocketTask.receive delivers a single message per call,
    /// so the loop must re-register after every success.
    private func listenForMessages() {
        webSocket?.receive { [weak self] result in
            guard let self else { return }
            switch result {
            case .success(let message):
                switch message {
                case .string(let text):
                    self.handleMessage(text)
                case .data(let data):
                    if let text = String(data: data, encoding: .utf8) {
                        self.handleMessage(text)
                    }
                @unknown default:
                    break
                }
                // Continue listening
                self.listenForMessages()
            case .failure(let error):
                // Surface transport failures to the delegate instead of only
                // logging them; the receive loop intentionally stops here.
                print("[VoiceSession] Receive error: \(error.localizedDescription)")
                self.delegate?.voiceSession(self, didReceiveError: error.localizedDescription)
            }
        }
    }

    /// Decodes an incoming JSON text frame and routes it by its "type" field.
    private func handleMessage(_ text: String) {
        guard let data = text.data(using: .utf8),
              let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let type = json["type"] as? String else { return }
        let payload = json["payload"] as? [String: Any]
        switch type {
        case "started":
            let sessionId = payload?["session_id"] as? String ?? ""
            let conversationId = payload?["conversation_id"] as? String ?? ""
            delegate?.voiceSession(self, didStartSession: sessionId, conversationId: conversationId)
        case "ready":
            delegate?.voiceSessionDidBecomeReady(self)
        case "audio":
            if let audioData = payload?["data"] as? String {
                delegate?.voiceSession(self, didReceiveAudio: audioData)
            }
        case "tool_call":
            let toolName = payload?["tool_name"] as? String ?? ""
            let status = payload?["status"] as? String ?? ""
            delegate?.voiceSession(self, didReceiveToolCall: toolName, status: status)
        case "interrupted":
            delegate?.voiceSessionWasInterrupted(self)
        case "error":
            let message = payload?["message"] as? String ?? "Unknown error"
            delegate?.voiceSession(self, didReceiveError: message)
        case "ended":
            delegate?.voiceSession(self, didEnd: payload ?? [:])
        default:
            print("[VoiceSession] Unknown message type: \(type)")
        }
    }
}
2. Audio Capture (Microphone)
Use AVAudioEngine to capture microphone audio and convert it to 16-bit PCM:
import AVFoundation
/// Captures microphone audio with AVAudioEngine and delivers it as
/// base64-encoded 16-bit mono PCM at 16 kHz.
class AudioCapture {
    private let audioEngine = AVAudioEngine()
    private var converter: AVAudioConverter?

    /// Receives each captured chunk as a base64 string of 16-bit/16 kHz PCM.
    /// Called on the audio tap's thread, not the main thread.
    var onAudioCaptured: ((String) -> Void)?

    /// Starts capturing. Throws if the audio engine fails to start.
    func start() throws {
        let inputNode = audioEngine.inputNode

        // The tap must be installed with the input node's native format:
        // requesting a tap whose sample rate differs from the hardware format
        // raises a runtime exception on iOS. Capture natively, then resample
        // with AVAudioConverter to the 16-bit/16 kHz mono the server expects.
        let hardwareFormat = inputNode.outputFormat(forBus: 0)
        let targetFormat = AVAudioFormat(
            commonFormat: .pcmFormatInt16,
            sampleRate: 16000,
            channels: 1,
            interleaved: true
        )!
        converter = AVAudioConverter(from: hardwareFormat, to: targetFormat)

        inputNode.installTap(
            onBus: 0,
            bufferSize: 4096,
            format: hardwareFormat
        ) { [weak self] buffer, _ in
            guard let self, let converter = self.converter else { return }

            // Size the output buffer by the resampling ratio (+1 frame slack).
            let ratio = targetFormat.sampleRate / hardwareFormat.sampleRate
            let capacity = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 1
            guard let converted = AVAudioPCMBuffer(
                pcmFormat: targetFormat,
                frameCapacity: capacity
            ) else { return }

            // Hand the tap buffer to the converter exactly once per pass.
            var delivered = false
            let status = converter.convert(to: converted, error: nil) { _, outStatus in
                if delivered {
                    outStatus.pointee = .noDataNow
                    return nil
                }
                delivered = true
                outStatus.pointee = .haveData
                return buffer
            }

            guard status != .error,
                  converted.frameLength > 0,
                  let channelData = converted.int16ChannelData else { return }
            let byteCount = Int(converted.frameLength) * MemoryLayout<Int16>.size
            let data = Data(bytes: channelData.pointee, count: byteCount)
            self.onAudioCaptured?(data.base64EncodedString())
        }

        audioEngine.prepare()
        try audioEngine.start()
    }

    /// Stops capturing and removes the tap.
    func stop() {
        audioEngine.inputNode.removeTap(onBus: 0)
        audioEngine.stop()
        converter = nil
    }
}
3. Audio Playback
Use AVAudioPlayerNode to play back the agent’s PCM audio in real time:
import AVFoundation
/// Plays the agent's 16-bit mono PCM audio (24 kHz) in real time.
class AudioPlayer {
    private let audioEngine = AVAudioEngine()
    private let playerNode = AVAudioPlayerNode()
    private let playbackFormat: AVAudioFormat

    init() {
        // Agent audio arrives as 16-bit mono PCM at 24 kHz, but
        // AVAudioPlayerNode reliably accepts only the engine's standard
        // (deinterleaved Float32) format — connecting it with an interleaved
        // Int16 format fails at runtime. Connect with Float32 and decode the
        // Int16 samples in enqueue(base64Audio:).
        playbackFormat = AVAudioFormat(standardFormatWithSampleRate: 24000, channels: 1)!
        audioEngine.attach(playerNode)
        audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: playbackFormat)
        do {
            try audioEngine.start()
            playerNode.play()
        } catch {
            // Don't swallow the failure silently — playback would be mute.
            print("[AudioPlayer] Failed to start engine: \(error.localizedDescription)")
        }
    }

    /// Decodes one base64 chunk of 16-bit mono PCM and schedules it for playback.
    func enqueue(base64Audio: String) {
        guard let audioData = Data(base64Encoded: base64Audio) else { return }
        let frameCount = AVAudioFrameCount(audioData.count / MemoryLayout<Int16>.size)
        guard frameCount > 0,
              let buffer = AVAudioPCMBuffer(
                  pcmFormat: playbackFormat,
                  frameCapacity: frameCount
              ),
              let floatChannel = buffer.floatChannelData?[0] else { return }
        buffer.frameLength = frameCount

        // Convert Int16 samples to normalized Float32 in [-1, 1].
        // Assumes little-endian sample order on the wire — TODO confirm
        // against the API documentation.
        audioData.withUnsafeBytes { rawBuffer in
            let samples = rawBuffer.bindMemory(to: Int16.self)
            for index in 0..<Int(frameCount) {
                floatChannel[index] = Float(Int16(littleEndian: samples[index])) / 32768.0
            }
        }

        playerNode.scheduleBuffer(buffer)
        // The node can stop after being explicitly stopped; ensure playback.
        if !playerNode.isPlaying {
            playerNode.play()
        }
    }

    /// Stops playback and tears down the engine.
    func stop() {
        playerNode.stop()
        audioEngine.stop()
    }
}
4. Putting It All Together
Here’s a SwiftUI view that ties everything together:

import SwiftUI
import AVFoundation
/// Minimal UI: a status line, a mic toggle button, and (after the session
/// ends) the conversation transcript.
struct VoiceAssistantView: View {
    @StateObject private var viewModel = VoiceAssistantViewModel()

    var body: some View {
        VStack(spacing: 24) {
            statusLabel
            micButton
            if !viewModel.transcript.isEmpty {
                transcriptList
            }
        }
        .padding()
    }

    /// Current session status, e.g. "Listening...".
    private var statusLabel: some View {
        Text(viewModel.statusText)
            .font(.headline)
            .foregroundColor(.secondary)
    }

    /// Tapping toggles the voice session on and off.
    private var micButton: some View {
        Button(action: { viewModel.toggleVoice() }) {
            Image(systemName: viewModel.isActive ? "mic.fill" : "mic")
                .font(.system(size: 48))
                .foregroundColor(viewModel.isActive ? .red : .blue)
        }
        .padding()
    }

    /// Scrollable list of "role: text" transcript lines.
    private var transcriptList: some View {
        ScrollView {
            LazyVStack(alignment: .leading, spacing: 8) {
                ForEach(viewModel.transcript, id: \.self) { line in
                    Text(line)
                        .font(.body)
                }
            }
            .padding()
        }
    }
}
/// Owns the voice session, microphone capture, and playback objects, and
/// publishes UI state for `VoiceAssistantView`.
@MainActor
class VoiceAssistantViewModel: ObservableObject {
    @Published var isActive = false
    @Published var statusText = "Tap the mic to start"
    @Published var transcript: [String] = []

    private var voiceSession: VoiceSession?
    private var audioCapture: AudioCapture?
    private var audioPlayer: AudioPlayer?

    /// Starts a session if idle, stops it if active.
    func toggleVoice() {
        if isActive {
            stopSession()
        } else {
            startSession()
        }
    }

    private func startSession() {
        // Fail fast with a visible message rather than connecting with an
        // empty token, which the server would reject with an opaque error.
        let apiKey = ProcessInfo.processInfo.environment["DATAGRID_API_KEY"] ?? ""
        guard !apiKey.isEmpty else {
            statusText = "Missing DATAGRID_API_KEY"
            return
        }

        // 1. Configure the audio session; surface failures instead of
        //    silently swallowing them with try?.
        do {
            let audioSession = AVAudioSession.sharedInstance()
            try audioSession.setCategory(.playAndRecord, mode: .voiceChat)
            try audioSession.setActive(true)
        } catch {
            statusText = "Audio session error: \(error.localizedDescription)"
            return
        }

        // 2. Create and connect the voice session.
        voiceSession = VoiceSession(apiKey: apiKey)
        voiceSession?.delegate = self
        voiceSession?.connect()

        // 3. Create audio components.
        audioCapture = AudioCapture()
        audioPlayer = AudioPlayer()
        statusText = "Connecting..."
    }

    private func stopSession() {
        // Ask the server to finish. The socket is intentionally left open so
        // the final "ended" message (which carries the transcript) can arrive.
        voiceSession?.stop()
        audioCapture?.stop()
        audioPlayer?.stop()
        // Release the audio hardware and let other apps resume their audio.
        try? AVAudioSession.sharedInstance().setActive(
            false,
            options: .notifyOthersOnDeactivation
        )
        isActive = false
        statusText = "Tap the mic to start"
    }
}
// MARK: - VoiceSessionDelegate
// Callbacks arrive on the main queue (the URLSession delegate queue), but the
// methods are nonisolated; each hops onto the MainActor before touching state.
extension VoiceAssistantViewModel: VoiceSessionDelegate {
    nonisolated func voiceSessionDidConnect(_ session: VoiceSession) {
        Task { @MainActor in
            statusText = "Connected"
            // Send "start" immediately after the WebSocket opens.
            session.startSession(agentId: "agent_abc123")
        }
    }

    nonisolated func voiceSession(
        _ session: VoiceSession,
        didStartSession sessionId: String,
        conversationId: String
    ) {
        Task { @MainActor in
            statusText = "Session started"
        }
    }

    nonisolated func voiceSessionDidBecomeReady(_ session: VoiceSession) {
        Task { @MainActor in
            statusText = "Listening..."
            isActive = true
            // Start capturing microphone audio and streaming it to the server.
            audioCapture?.onAudioCaptured = { [weak session] base64PCM in
                session?.sendAudio(base64PCM: base64PCM)
            }
            do {
                try audioCapture?.start()
            } catch {
                // Surface mic failures instead of silently staying mute.
                statusText = "Microphone error: \(error.localizedDescription)"
            }
        }
    }

    nonisolated func voiceSession(_ session: VoiceSession, didReceiveAudio base64Audio: String) {
        Task { @MainActor in
            statusText = "Agent speaking..."
            audioPlayer?.enqueue(base64Audio: base64Audio)
        }
    }

    nonisolated func voiceSession(
        _ session: VoiceSession,
        didReceiveToolCall toolName: String,
        status: String
    ) {
        Task { @MainActor in
            if status == "started" {
                statusText = "Using \(toolName)..."
            }
        }
    }

    nonisolated func voiceSessionWasInterrupted(_ session: VoiceSession) {
        Task { @MainActor in
            statusText = "Listening..."
        }
    }

    nonisolated func voiceSession(_ session: VoiceSession, didEnd payload: [String: Any]) {
        Task { @MainActor in
            if let transcriptItems = payload["transcript"] as? [[String: String]] {
                transcript = transcriptItems.map { item in
                    let role = item["role"] ?? "unknown"
                    let text = item["text"] ?? ""
                    return "\(role): \(text)"
                }
            }
            stopSession()
            // The session is over — close the socket so the connection (and
            // the URLSession's strong reference to its delegate) is released.
            session.disconnect()
        }
    }

    nonisolated func voiceSession(_ session: VoiceSession, didReceiveError message: String) {
        Task { @MainActor in
            statusText = "Error: \(message)"
            stopSession()
            // A fatal error ends the session; tear the connection down too.
            session.disconnect()
        }
    }
}
Important Notes
Audio Formats
- Microphone input: 16-bit mono PCM, 16kHz sample rate
- Agent response: 16-bit mono PCM, 24kHz sample rate
Permissions
Add the following to your Info.plist:
<key>NSMicrophoneUsageDescription</key>
<string>This app needs microphone access for voice conversations.</string>
Interruption Handling
When the user starts speaking while the agent is responding, send an interrupt message to cut off the agent’s response. You can detect this using Voice Activity Detection (VAD) or by monitoring microphone input levels.
Error Handling & Reconnection
The WebSocket connection can drop due to network issues. In production, implement:

- Automatic reconnection with exponential backoff
- Graceful handling of URLSessionWebSocketTask delegate errors
- Audio session interruption handling (e.g., phone calls)
// Example reconnection logic
/// Called when the server closes the WebSocket. A `.goingAway` close is a
/// normal shutdown; anything else is treated as unexpected and retried.
func urlSession(
    _ session: URLSession,
    webSocketTask: URLSessionWebSocketTask,
    didCloseWith closeCode: URLSessionWebSocketTask.CloseCode,
    reason: Data?
) {
    if closeCode != .goingAway {
        // Unexpected disconnect — attempt reconnection. Capture self weakly
        // so the pending closure cannot keep a discarded session alive.
        // In production, replace this fixed delay with exponential backoff
        // and a retry cap.
        DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) { [weak self] in
            self?.connect()
        }
    }
}
Thread Safety
The URLSessionWebSocketTask delivers callbacks on the delegate queue. Make sure to dispatch UI updates and audio operations to the appropriate threads.