Building a Voice Assistant in a Swift iOS App
This guide walks through building a voice assistant in a Swift iOS app. You’ll capture microphone audio, stream it to the Datagrid Voice API over a WebSocket, and play back the agent’s audio responses in real time.
This guide uses the direct WebSocket approach — connecting straight to wss://api.datagrid.com/ws/voice. You can also use the REST endpoint (POST /v1/voice) to get a pre-built WebSocket URL and start message first.

Overview

The integration has four parts:
  1. WebSocket connection — Connect to the voice endpoint and manage the message protocol
  2. Audio capture — Record microphone input as 16-bit mono PCM at 16kHz
  3. Audio playback — Play the agent’s response audio (16-bit mono PCM at 24kHz)
  4. UI — A simple button to start/stop the conversation

1. WebSocket Client

Create a class that manages the WebSocket connection and message routing:
import Foundation

/// Receives connection lifecycle and voice-protocol events from a `VoiceSession`.
/// Callbacks are delivered on the URLSession delegate queue (main, as
/// configured in `VoiceSession.connect()`).
protocol VoiceSessionDelegate: AnyObject {
    /// The WebSocket handshake completed; it is now safe to send "start".
    func voiceSessionDidConnect(_ session: VoiceSession)
    /// An "audio" message arrived; `base64Audio` is base64-encoded 16-bit mono PCM.
    func voiceSession(_ session: VoiceSession, didReceiveAudio base64Audio: String)
    /// A "ready" message arrived: the server will now accept microphone audio.
    func voiceSessionDidBecomeReady(_ session: VoiceSession)
    /// A "started" message arrived with the session/conversation identifiers.
    func voiceSession(_ session: VoiceSession, didStartSession sessionId: String, conversationId: String)
    /// A "tool_call" message arrived; `status` is e.g. "started" — see the API docs.
    func voiceSession(_ session: VoiceSession, didReceiveToolCall toolName: String, status: String)
    /// An "interrupted" message arrived: the agent's response was cut off.
    func voiceSessionWasInterrupted(_ session: VoiceSession)
    /// An "ended" message arrived; the payload may include a "transcript" array.
    func voiceSession(_ session: VoiceSession, didEnd payload: [String: Any])
    /// An "error" message arrived, or a client-side failure occurred.
    func voiceSession(_ session: VoiceSession, didReceiveError message: String)
}

/// Manages the WebSocket connection to the Datagrid Voice API: opens the
/// socket, sends protocol messages ("start", "audio", "stop", "interrupt"),
/// and routes incoming JSON messages to a `VoiceSessionDelegate`.
class VoiceSession: NSObject, URLSessionWebSocketDelegate {
    weak var delegate: VoiceSessionDelegate?

    private var webSocket: URLSessionWebSocketTask?
    private var urlSession: URLSession?
    private let apiKey: String
    private let baseURL: String

    /// - Parameters:
    ///   - apiKey: Datagrid API key; sent as the `token` query parameter.
    ///   - baseURL: WebSocket origin; overridable for staging/testing.
    init(apiKey: String, baseURL: String = "wss://api.datagrid.com") {
        self.apiKey = apiKey
        self.baseURL = baseURL
        super.init()
    }

    // MARK: - Connection

    /// Opens the WebSocket and starts the receive loop. Delegate callbacks
    /// arrive on the main queue.
    func connect() {
        // Percent-encode the key so characters like '+' or '&' can't corrupt
        // the query string. NOTE: the token travels in the URL and may be
        // captured by proxy/server logs.
        let encodedKey = apiKey.addingPercentEncoding(withAllowedCharacters: .urlQueryAllowed) ?? apiKey
        let urlString = "\(baseURL)/ws/voice?token=\(encodedKey)"
        guard let url = URL(string: urlString) else {
            // Don't fail silently — the caller is waiting for a callback.
            delegate?.voiceSession(self, didReceiveError: "Invalid voice endpoint URL")
            return
        }

        urlSession = URLSession(
            configuration: .default,
            delegate: self,
            delegateQueue: .main
        )
        webSocket = urlSession?.webSocketTask(with: url)
        webSocket?.resume()
        listenForMessages()
    }

    /// Closes the socket and tears down the session.
    func disconnect() {
        webSocket?.cancel(with: .goingAway, reason: nil)
        webSocket = nil
        // URLSession retains its delegate (self) strongly until invalidated —
        // without this call every VoiceSession instance leaks.
        urlSession?.finishTasksAndInvalidate()
        urlSession = nil
    }

    // MARK: - URLSessionWebSocketDelegate

    func urlSession(
        _ session: URLSession,
        webSocketTask: URLSessionWebSocketTask,
        didOpenWithProtocol protocol: String?
    ) {
        delegate?.voiceSessionDidConnect(self)
    }

    // MARK: - Sending Messages

    /// Sends the "start" message. Omitted fields are left out of the payload
    /// so the server applies its defaults.
    func startSession(agentId: String? = nil, conversationId: String? = nil) {
        var payload: [String: Any] = [:]
        if let agentId { payload["agent_id"] = agentId }
        if let conversationId { payload["conversation_id"] = conversationId }

        send(type: "start", payload: payload)
    }

    /// Streams one chunk of base64-encoded 16-bit mono PCM microphone audio.
    func sendAudio(base64PCM: String) {
        send(type: "audio", payload: ["data": base64PCM])
    }

    /// Asks the server to end the session (expect an "ended" reply).
    func stop() {
        send(type: "stop")
    }

    /// Cuts off the agent's current response (barge-in).
    func interrupt() {
        send(type: "interrupt")
    }

    // MARK: - Private

    /// Serializes `{"type": ..., "payload": ...}` as JSON text and sends it.
    private func send(type: String, payload: [String: Any]? = nil) {
        var message: [String: Any] = ["type": type]
        if let payload { message["payload"] = payload }

        guard let data = try? JSONSerialization.data(withJSONObject: message),
              let string = String(data: data, encoding: .utf8) else { return }

        webSocket?.send(.string(string)) { error in
            if let error {
                print("[VoiceSession] Send error: \(error.localizedDescription)")
            }
        }
    }

    /// Receive loop: `URLSessionWebSocketTask.receive` delivers a single
    /// message per call, so it must be re-armed after every success.
    private func listenForMessages() {
        webSocket?.receive { [weak self] result in
            guard let self else { return }

            switch result {
            case .success(let message):
                switch message {
                case .string(let text):
                    self.handleMessage(text)
                case .data(let data):
                    if let text = String(data: data, encoding: .utf8) {
                        self.handleMessage(text)
                    }
                @unknown default:
                    break
                }
                // Continue listening
                self.listenForMessages()

            case .failure(let error):
                print("[VoiceSession] Receive error: \(error.localizedDescription)")
                // Surface unexpected failures to the delegate, but stay quiet
                // when the failure is just our own intentional disconnect()
                // (webSocket is nil by then).
                if self.webSocket != nil {
                    self.delegate?.voiceSession(self, didReceiveError: error.localizedDescription)
                }
            }
        }
    }

    /// Decodes a JSON protocol message and dispatches it to the delegate.
    private func handleMessage(_ text: String) {
        guard let data = text.data(using: .utf8),
              let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let type = json["type"] as? String else { return }

        let payload = json["payload"] as? [String: Any]

        switch type {
        case "started":
            let sessionId = payload?["session_id"] as? String ?? ""
            let conversationId = payload?["conversation_id"] as? String ?? ""
            delegate?.voiceSession(self, didStartSession: sessionId, conversationId: conversationId)

        case "ready":
            delegate?.voiceSessionDidBecomeReady(self)

        case "audio":
            if let audioData = payload?["data"] as? String {
                delegate?.voiceSession(self, didReceiveAudio: audioData)
            }

        case "tool_call":
            let toolName = payload?["tool_name"] as? String ?? ""
            let status = payload?["status"] as? String ?? ""
            delegate?.voiceSession(self, didReceiveToolCall: toolName, status: status)

        case "interrupted":
            delegate?.voiceSessionWasInterrupted(self)

        case "error":
            let message = payload?["message"] as? String ?? "Unknown error"
            delegate?.voiceSession(self, didReceiveError: message)

        case "ended":
            delegate?.voiceSession(self, didEnd: payload ?? [:])

        default:
            print("[VoiceSession] Unknown message type: \(type)")
        }
    }
}

2. Audio Capture (Microphone)

Use AVAudioEngine to capture microphone audio and convert it to 16-bit PCM:
import AVFoundation

/// Captures microphone audio with AVAudioEngine and delivers it as
/// base64-encoded 16-bit mono PCM at 16 kHz via `onAudioCaptured`.
class AudioCapture {
    private let audioEngine = AVAudioEngine()
    private var converter: AVAudioConverter?

    /// Called for every converted chunk (base64-encoded 16-bit/16 kHz mono PCM).
    var onAudioCaptured: ((String) -> Void)?

    /// Starts capture. Throws if the engine fails to start or no input is available.
    func start() throws {
        let inputNode = audioEngine.inputNode

        // The input node only accepts taps in its own hardware format —
        // installing a tap with a mismatched format (e.g. 16 kHz Int16)
        // raises a runtime exception. Capture natively, then resample.
        let hardwareFormat = inputNode.outputFormat(forBus: 0)
        guard hardwareFormat.sampleRate > 0 else {
            throw NSError(
                domain: "AudioCapture", code: -1,
                userInfo: [NSLocalizedDescriptionKey: "No audio input available"]
            )
        }

        let targetFormat = AVAudioFormat(
            commonFormat: .pcmFormatInt16,
            sampleRate: 16000,
            channels: 1,
            interleaved: true
        )!
        converter = AVAudioConverter(from: hardwareFormat, to: targetFormat)

        inputNode.installTap(
            onBus: 0,
            bufferSize: 4096,
            format: hardwareFormat
        ) { [weak self] buffer, _ in
            guard let self, let converter = self.converter else { return }

            // Size the output buffer by the resampling ratio (+1 for rounding).
            let ratio = targetFormat.sampleRate / hardwareFormat.sampleRate
            let capacity = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 1
            guard let converted = AVAudioPCMBuffer(
                pcmFormat: targetFormat,
                frameCapacity: capacity
            ) else { return }

            // Feed the captured buffer exactly once, then report "no data".
            var fed = false
            var conversionError: NSError?
            converter.convert(to: converted, error: &conversionError) { _, status in
                if fed {
                    status.pointee = .noDataNow
                    return nil
                }
                fed = true
                status.pointee = .haveData
                return buffer
            }

            guard conversionError == nil,
                  converted.frameLength > 0,
                  let channelData = converted.int16ChannelData else { return }

            let data = Data(
                bytes: channelData.pointee,
                count: Int(converted.frameLength) * MemoryLayout<Int16>.size
            )
            self.onAudioCaptured?(data.base64EncodedString())
        }

        try audioEngine.start()
    }

    /// Stops capture and removes the tap.
    func stop() {
        audioEngine.inputNode.removeTap(onBus: 0)
        audioEngine.stop()
        converter = nil
    }
}

3. Audio Playback

Use AVAudioPlayerNode to play back the agent’s PCM audio in real time:
import AVFoundation

/// Plays streamed agent audio (base64-encoded 16-bit mono PCM at 24 kHz)
/// through an AVAudioPlayerNode in real time.
class AudioPlayer {
    private let audioEngine = AVAudioEngine()
    private let playerNode = AVAudioPlayerNode()
    private let playbackFormat: AVAudioFormat

    init() {
        // Connect using the engine's canonical format (non-interleaved
        // Float32). Interleaved Int16 connections to the mixer can be
        // rejected at runtime, so incoming Int16 samples are converted
        // to floats in enqueue(base64Audio:).
        playbackFormat = AVAudioFormat(
            standardFormatWithSampleRate: 24000,
            channels: 1
        )!

        audioEngine.attach(playerNode)
        audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: playbackFormat)
        do {
            try audioEngine.start()
        } catch {
            // Don't swallow silently: playback would produce no sound.
            print("[AudioPlayer] Engine start failed: \(error.localizedDescription)")
        }
        playerNode.play()
    }

    /// Decodes one base64 chunk of 16-bit mono PCM and schedules it for playback.
    func enqueue(base64Audio: String) {
        guard let audioData = Data(base64Encoded: base64Audio),
              !audioData.isEmpty else { return }

        let frameCount = AVAudioFrameCount(audioData.count / MemoryLayout<Int16>.size)
        guard frameCount > 0,
              let buffer = AVAudioPCMBuffer(
                  pcmFormat: playbackFormat,
                  frameCapacity: frameCount
              ),
              let channel = buffer.floatChannelData?.pointee else { return }

        buffer.frameLength = frameCount

        // Convert Int16 → Float32 in [-1, 1]. Assumes little-endian samples
        // on the wire — TODO confirm against the API's audio format docs.
        audioData.withUnsafeBytes { (raw: UnsafeRawBufferPointer) in
            let samples = raw.bindMemory(to: Int16.self)
            for i in 0..<Int(frameCount) {
                channel[i] = Float(Int16(littleEndian: samples[i])) / 32768.0
            }
        }

        playerNode.scheduleBuffer(buffer)
    }

    /// Stops playback and the engine.
    func stop() {
        playerNode.stop()
        audioEngine.stop()
    }
}

4. Putting It All Together

Here’s a SwiftUI view that ties everything together:
import SwiftUI
import AVFoundation

/// Minimal UI for the voice assistant: a status line, a mic toggle button,
/// and (after a session ends) the conversation transcript.
struct VoiceAssistantView: View {
    @StateObject private var viewModel = VoiceAssistantViewModel()

    var body: some View {
        VStack(spacing: 24) {
            Text(viewModel.statusText)
                .font(.headline)
                .foregroundColor(.secondary)

            micButton

            if !viewModel.transcript.isEmpty {
                transcriptView
            }
        }
        .padding()
    }

    /// Mic toggle: filled red icon while a session is active.
    private var micButton: some View {
        Button {
            viewModel.toggleVoice()
        } label: {
            Image(systemName: viewModel.isActive ? "mic.fill" : "mic")
                .font(.system(size: 48))
                .foregroundColor(viewModel.isActive ? .red : .blue)
        }
        .padding()
    }

    /// Scrollable list of "role: text" transcript lines.
    private var transcriptView: some View {
        ScrollView {
            LazyVStack(alignment: .leading, spacing: 8) {
                ForEach(viewModel.transcript, id: \.self) { line in
                    Text(line)
                        .font(.body)
                }
            }
            .padding()
        }
    }
}

/// Drives the voice assistant UI: owns the WebSocket session and the audio
/// capture/playback pipeline, and publishes state for the view.
@MainActor
class VoiceAssistantViewModel: ObservableObject {
    @Published var isActive = false
    @Published var statusText = "Tap the mic to start"
    @Published var transcript: [String] = []

    private var voiceSession: VoiceSession?
    private var audioCapture: AudioCapture?
    private var audioPlayer: AudioPlayer?

    /// Toggles the conversation on/off from the mic button.
    func toggleVoice() {
        if isActive {
            stopSession()
        } else {
            startSession()
        }
    }

    private func startSession() {
        // 1. Configure the audio session; surface failures instead of
        //    silently continuing with a broken audio pipeline.
        let audioSession = AVAudioSession.sharedInstance()
        do {
            try audioSession.setCategory(.playAndRecord, mode: .voiceChat)
            try audioSession.setActive(true)
        } catch {
            statusText = "Audio setup failed: \(error.localizedDescription)"
            return
        }

        // 2. Create and connect the voice session.
        //    NOTE(review): environment variables only exist when launched from
        //    Xcode — load the key from secure storage in production.
        let apiKey = ProcessInfo.processInfo.environment["DATAGRID_API_KEY"] ?? ""
        voiceSession = VoiceSession(apiKey: apiKey)
        voiceSession?.delegate = self
        voiceSession?.connect()

        // 3. Create the audio components (capture actually starts on "ready").
        audioCapture = AudioCapture()
        audioPlayer = AudioPlayer()

        statusText = "Connecting..."
    }

    private func stopSession() {
        voiceSession?.stop()
        // stop() only sends the "stop" protocol message; disconnect() actually
        // closes the socket and invalidates the URLSession so nothing leaks.
        voiceSession?.disconnect()
        voiceSession = nil

        audioCapture?.stop()
        audioCapture = nil
        audioPlayer?.stop()
        audioPlayer = nil

        // Release the audio hardware for other apps.
        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)

        isActive = false
        statusText = "Tap the mic to start"
    }
}

// MARK: - VoiceSessionDelegate

extension VoiceAssistantViewModel: VoiceSessionDelegate {
    /// Socket is open — kick off the session immediately.
    nonisolated func voiceSessionDidConnect(_ session: VoiceSession) {
        Task { @MainActor in
            statusText = "Connected"
            // Send "start" immediately after the WebSocket opens.
            session.startSession(agentId: "agent_abc123")
        }
    }

    /// Server acknowledged the "start" message.
    nonisolated func voiceSession(
        _ session: VoiceSession,
        didStartSession sessionId: String,
        conversationId: String
    ) {
        Task { @MainActor in
            statusText = "Session started"
        }
    }

    /// Server is ready for audio — begin streaming the microphone.
    nonisolated func voiceSessionDidBecomeReady(_ session: VoiceSession) {
        Task { @MainActor in
            statusText = "Listening..."
            isActive = true

            // Forward every captured chunk straight to the server.
            audioCapture?.onAudioCaptured = { [weak session] chunk in
                session?.sendAudio(base64PCM: chunk)
            }
            try? audioCapture?.start()
        }
    }

    /// Agent audio chunk — hand it to the playback queue.
    nonisolated func voiceSession(_ session: VoiceSession, didReceiveAudio base64Audio: String) {
        Task { @MainActor in
            statusText = "Agent speaking..."
            audioPlayer?.enqueue(base64Audio: base64Audio)
        }
    }

    /// Tool usage updates — only the "started" status is shown to the user.
    nonisolated func voiceSession(
        _ session: VoiceSession,
        didReceiveToolCall toolName: String,
        status: String
    ) {
        Task { @MainActor in
            guard status == "started" else { return }
            statusText = "Using \(toolName)..."
        }
    }

    /// The agent's response was cut off; we are back to listening.
    nonisolated func voiceSessionWasInterrupted(_ session: VoiceSession) {
        Task { @MainActor in
            statusText = "Listening..."
        }
    }

    /// Session ended — render the transcript (if any) and tear down.
    nonisolated func voiceSession(_ session: VoiceSession, didEnd payload: [String: Any]) {
        Task { @MainActor in
            if let items = payload["transcript"] as? [[String: String]] {
                transcript = items.map { entry in
                    "\(entry["role"] ?? "unknown"): \(entry["text"] ?? "")"
                }
            }
            stopSession()
        }
    }

    /// Any server or transport error stops the session.
    nonisolated func voiceSession(_ session: VoiceSession, didReceiveError message: String) {
        Task { @MainActor in
            statusText = "Error: \(message)"
            stopSession()
        }
    }
}

Important Notes

Audio Formats

  • Microphone input: 16-bit mono PCM, 16kHz sample rate
  • Agent response: 16-bit mono PCM, 24kHz sample rate

Permissions

Add the following to your Info.plist:
<key>NSMicrophoneUsageDescription</key>
<string>This app needs microphone access for voice conversations.</string>

Interruption Handling

When the user starts speaking while the agent is responding, send an interrupt message to cut off the agent’s response. You can detect this using Voice Activity Detection (VAD) or by monitoring microphone input levels.

Error Handling & Reconnection

The WebSocket connection can drop due to network issues. In production, implement:
  • Automatic reconnection with exponential backoff
  • Graceful handling of URLSessionWebSocketTask delegate errors
  • Audio session interruption handling (e.g., phone calls)
// Example reconnection logic
/// Example reconnection logic for unexpected WebSocket closures.
func urlSession(
    _ session: URLSession,
    webSocketTask: URLSessionWebSocketTask,
    didCloseWith closeCode: URLSessionWebSocketTask.CloseCode,
    reason: Data?
) {
    // .goingAway is the code we send ourselves from disconnect();
    // anything else means the connection dropped unexpectedly.
    guard closeCode != .goingAway else { return }

    // Fixed delay shown for brevity — use exponential backoff in production.
    // Capture self weakly so a deallocated session isn't kept alive (or
    // revived) by the pending reconnect.
    DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) { [weak self] in
        self?.connect()
    }
}

Thread Safety

The URLSessionWebSocketTask delivers callbacks on the delegate queue. Make sure to dispatch UI updates and audio operations to the appropriate threads.