From dd2d84a08009cc465dda8352bbf4be553463a88c Mon Sep 17 00:00:00 2001
From: Chris Spencer <spencercw@gmail.com>
Date: Wed, 26 Jan 2022 20:55:24 +0000
Subject: [PATCH] [client] audio: adjust playback speed to match audio device
 clock

This change is based on the techniques described in [1] and [2].

The input audio stream from Spice is not synchronised to the audio playback
device. While the input and output may be both nominally running at 48 kHz,
when compared against each other, they will differ by a tiny fraction of a
percent. Given enough time (typically on the order of a few hours), this
will result in the ring buffer becoming completely full or completely
empty. It will stay in this state permanently, periodically resulting in
glitches as the buffer repeatedly underruns or overruns.

To address this, adjust the speed of the received data to match the rate at
which it is being consumed by the audio device. This will result in a
slight pitch shift, but the changes should be small and smooth enough that
this is unnoticeable to the user.

The process works roughly as follows:
1. Every time audio data is received from Spice, or consumed by the audio
   device, sample the current time. These are fed into a pair of delay
   locked loops to produce smoothed approximations of the two clocks.
2. Compute the difference between the two clocks and compare this against
   the target latency to produce an error value. This error value will be
   quite stable during normal operation, but can change quite rapidly due
   to external factors, particularly at the start of playback. To smooth
   out any sudden changes in playback speed, which would be noticeable to
   the user, this value is also filtered through another delay locked loop.
3. Feed this error value into a PI controller to produce a ratio value.
   This is the target playback speed in order to bring the error value
   towards zero.
4. Resample the input audio using the computed ratio to apply the speed
   change. The output of the resampler is what is ultimately inserted into
   the ring buffer for consumption by the audio device.

Since this process targets a specific latency value, rather than simply
trying to rate match the input and output, it also has the effect of
'correcting' latency issues. If a high latency application (such as a media
player) is already running, the time between requesting the start of
playback and the audio device actually starting to consume samples can be
very high, easily in the hundreds of milliseconds. The changes here will
automatically adjust the playback speed over the course of a few minutes to
bring the latency back down to the target value.

[1] https://kokkinizita.linuxaudio.org/papers/adapt-resamp.pdf
[2] https://kokkinizita.linuxaudio.org/papers/usingdll.pdf
---
 client/CMakeLists.txt                    |   2 +
 client/audiodevs/PipeWire/pipewire.c     |   4 +-
 client/audiodevs/PulseAudio/pulseaudio.c |   9 +-
 client/include/interface/audiodev.h      |   2 +-
 client/src/audio.c                       | 319 ++++++++++++++++++++++-
 5 files changed, 318 insertions(+), 18 deletions(-)

diff --git a/client/CMakeLists.txt b/client/CMakeLists.txt
index ff3d4c37..4e003972 100644
--- a/client/CMakeLists.txt
+++ b/client/CMakeLists.txt
@@ -26,6 +26,7 @@ include(UninstallTarget)
 
 find_package(PkgConfig)
 pkg_check_modules(FONTCONFIG REQUIRED IMPORTED_TARGET fontconfig)
+pkg_check_modules(SAMPLERATE REQUIRED IMPORTED_TARGET samplerate)
 
 option(ENABLE_OPENGL "Enable the OpenGL renderer"       ON)
 add_feature_info(ENABLE_OPENGL ENABLE_OPENGL "Legacy OpenGL renderer.")
@@ -160,6 +161,7 @@ target_compile_definitions(looking-glass-client PRIVATE CIMGUI_DEFINE_ENUMS_AND_
 target_link_libraries(looking-glass-client
 	${EXE_FLAGS}
 	PkgConfig::FONTCONFIG
+	PkgConfig::SAMPLERATE
 	lg_common
 	displayservers
 	lgmp
diff --git a/client/audiodevs/PipeWire/pipewire.c b/client/audiodevs/PipeWire/pipewire.c
index 5d12dffb..eb726c5d 100644
--- a/client/audiodevs/PipeWire/pipewire.c
+++ b/client/audiodevs/PipeWire/pipewire.c
@@ -207,7 +207,7 @@ static void pipewire_playbackSetup(int channels, int sampleRate,
 
   pw.playback.channels    = channels;
   pw.playback.sampleRate  = sampleRate;
-  pw.playback.stride      = sizeof(uint16_t) * channels;
+  pw.playback.stride      = sizeof(float) * channels;
   pw.playback.pullFn      = pullFn;
   pw.playback.startFrames = maxLatencyFrames;
 
@@ -236,7 +236,7 @@ static void pipewire_playbackSetup(int channels, int sampleRate,
 
   params[0] = spa_format_audio_raw_build(&b, SPA_PARAM_EnumFormat,
       &SPA_AUDIO_INFO_RAW_INIT(
-        .format   = SPA_AUDIO_FORMAT_S16,
+        .format   = SPA_AUDIO_FORMAT_F32,
         .channels = channels,
         .rate     = sampleRate
         ));
diff --git a/client/audiodevs/PulseAudio/pulseaudio.c b/client/audiodevs/PulseAudio/pulseaudio.c
index c313ba46..34fd2768 100644
--- a/client/audiodevs/PulseAudio/pulseaudio.c
+++ b/client/audiodevs/PulseAudio/pulseaudio.c
@@ -220,6 +220,11 @@ static void pulseaudio_free(void)
 
 static void pulseaudio_write_cb(pa_stream * p, size_t nbytes, void * userdata)
 {
+  // PulseAudio tries to pull data from the stream as soon as it is created for
+  // some reason, even though it is corked
+  if (pa.sinkCorked)
+    return;
+
   uint8_t * dst;
 
   pa_stream_begin_write(p, (void **)&dst, &nbytes);
@@ -250,7 +255,7 @@ static void pulseaudio_setup(int channels, int sampleRate,
   const int PERIOD_LEN = 80;
 
   pa_sample_spec spec = {
-    .format   = PA_SAMPLE_S16LE,
+    .format   = PA_SAMPLE_FLOAT32,
     .rate     = sampleRate,
     .channels = channels
   };
@@ -279,7 +284,7 @@ static void pulseaudio_setup(int channels, int sampleRate,
     PA_STREAM_START_CORKED | PA_STREAM_ADJUST_LATENCY,
     NULL, NULL);
 
-  pa.sinkStride = channels * sizeof(uint16_t);
+  pa.sinkStride = channels * sizeof(float);
   pa.sinkPullFn = pullFn;
   pa.sinkStart  = attribs.tlength / pa.sinkStride;
   pa.sinkCorked = true;
diff --git a/client/include/interface/audiodev.h b/client/include/interface/audiodev.h
index 92952f77..1c04de6a 100644
--- a/client/include/interface/audiodev.h
+++ b/client/include/interface/audiodev.h
@@ -45,7 +45,7 @@ struct LG_AudioDevOps
   struct
   {
     /* setup the stream for playback but don't start it yet
-     * Note: currently SPICE only supports S16 samples so always assume so
+     * Note: the pull function returns f32 samples
      */
     void (*setup)(int channels, int sampleRate, LG_AudioPullFn pullFn);
 
diff --git a/client/src/audio.c b/client/src/audio.c
index e8f2d1d9..cac1a6d3 100644
--- a/client/src/audio.c
+++ b/client/src/audio.c
@@ -26,6 +26,9 @@
 
 #include "dynamic/audiodev.h"
 
+#include <math.h>
+#include <samplerate.h>
+#include <stdalign.h>
 #include <string.h>
 
 typedef enum
@@ -40,6 +43,45 @@ StreamState;
 #define STREAM_ACTIVE(state) \
   (state == STREAM_STATE_SETUP || state == STREAM_STATE_RUN)
 
+typedef struct
+{
+  int     periodFrames;
+  double  periodSec;
+  int64_t nextTime;
+  int64_t nextPosition;
+  double  b;
+  double  c;
+}
+PlaybackDeviceData;
+
+typedef struct
+{
+  float * framesIn;
+  float * framesOut;
+  int     framesOutSize;
+
+  int     periodFrames;
+  double  periodSec;
+  int64_t nextTime;
+  int64_t nextPosition;
+  double  b;
+  double  c;
+
+  int64_t devLastTime;
+  int64_t devNextTime;
+
+  int64_t devLastPosition;
+  int64_t devNextPosition;
+
+  double  offsetError;
+  double  offsetErrorIntegral;
+
+  double  ratioIntegral;
+
+  SRC_STATE * src;
+}
+PlaybackSpiceData;
+
 typedef struct
 {
   struct LG_AudioDevOps * audioDev;
@@ -50,13 +92,21 @@ typedef struct
     int         volumeChannels;
     uint16_t    volume[8];
     bool        mute;
+    int         channels;
     int         sampleRate;
     int         stride;
     RingBuffer  buffer;
+    RingBuffer  deviceTiming;
 
     LG_Lock     lock;
     RingBuffer  timings;
     GraphHandle graph;
+
+    // These two structs contain data specifically for use in the device and
+    // Spice data threads respectively. Keep them on separate cache lines to
+    // avoid false sharing
+    alignas(64) PlaybackDeviceData deviceData;
+    alignas(64) PlaybackSpiceData  spiceData;
   }
   playback;
 
@@ -75,6 +125,13 @@ AudioState;
 
 static AudioState audio = { 0 };
 
+typedef struct
+{
+  int64_t nextTime;
+  int64_t nextPosition;
+}
+PlaybackDeviceTick;
+
 static void playbackStopNL(void);
 
 void audio_init(void)
@@ -132,6 +189,16 @@ static void playbackStopNL(void)
   audio.playback.state = STREAM_STATE_STOP;
   audio.audioDev->playback.stop();
   ringbuffer_free(&audio.playback.buffer);
+  ringbuffer_free(&audio.playback.deviceTiming);
+  audio.playback.spiceData.src = src_delete(audio.playback.spiceData.src);
+
+  if (audio.playback.spiceData.framesIn)
+  {
+    free(audio.playback.spiceData.framesIn);
+    free(audio.playback.spiceData.framesOut);
+    audio.playback.spiceData.framesIn = NULL;
+    audio.playback.spiceData.framesOut = NULL;
+  }
 
   if (audio.playback.timings)
   {
@@ -142,13 +209,68 @@ static void playbackStopNL(void)
 
 static int playbackPullFrames(uint8_t * dst, int frames)
 {
+  DEBUG_ASSERT(frames >= 0);
+  if (frames == 0)
+    return frames;
+
+  PlaybackDeviceData * data = &audio.playback.deviceData;
+  int64_t now = nanotime();
+
   if (audio.playback.buffer)
-    frames = ringbuffer_consume(audio.playback.buffer, dst, frames);
+  {
+    // Measure the device clock and post to the Spice thread
+    if (frames != data->periodFrames)
+    {
+      bool init = data->periodFrames == 0;
+      if (init)
+        data->nextTime = now;
+
+      data->periodFrames  = frames;
+      data->periodSec     = (double) frames / audio.playback.sampleRate;
+      data->nextTime     += llrint(data->periodSec * 1.0e9);
+      data->nextPosition += frames;
+
+      double bandwidth = 0.05;
+      double omega = 2.0 * M_PI * bandwidth * data->periodSec;
+      data->b = M_SQRT2 * omega;
+      data->c = omega * omega;
+    }
+    else
+    {
+      double error = (now - data->nextTime) * 1.0e-9;
+      if (fabs(error) >= 0.2)
+      {
+        // Clock error is too high; slew the read pointer and reset the timing
+        // parameters to avoid getting too far out of sync
+        int slewFrames = round(error * audio.playback.sampleRate);
+        ringbuffer_consume(audio.playback.buffer, NULL, slewFrames);
+
+        data->periodSec     = (double) frames / audio.playback.sampleRate;
+        data->nextTime      = now + llrint(data->periodSec * 1.0e9);
+        data->nextPosition += slewFrames + frames;
+      }
+      else
+      {
+        data->nextTime     +=
+          llrint((data->b * error + data->periodSec) * 1.0e9);
+        data->periodSec    += data->c * error;
+        data->nextPosition += frames;
+      }
+    }
+
+    PlaybackDeviceTick tick = {
+      .nextTime     = data->nextTime,
+      .nextPosition = data->nextPosition
+    };
+    ringbuffer_append(audio.playback.deviceTiming, &tick, 1);
+
+    ringbuffer_consume(audio.playback.buffer, dst, frames);
+  }
   else
     frames = 0;
 
   if (audio.playback.state == STREAM_STATE_DRAIN &&
-      ringbuffer_getCount(audio.playback.buffer) == 0)
+      ringbuffer_getCount(audio.playback.buffer) <= 0)
   {
     LG_LOCK(audio.playback.lock);
     playbackStopNL();
@@ -172,18 +294,37 @@ void audio_playbackStart(int channels, int sampleRate, PSAudioFormat format,
     playbackStopNL();
   }
 
-  // Using a bounded ring buffer for now. We are not currently doing anything to
-  // keep the input and output in sync, so if we were using an unbounded buffer,
-  // it would eventually end up in a state of permanent underrun or overrun and
-  // the user would only hear silence
-  const int bufferFrames = sampleRate;
-  audio.playback.buffer = ringbuffer_new(bufferFrames,
-      channels * sizeof(uint16_t));
+  int srcError;
+  audio.playback.spiceData.src =
+    src_new(SRC_SINC_BEST_QUALITY, channels, &srcError);
+  if (!audio.playback.spiceData.src)
+  {
+    DEBUG_ERROR("Failed to create resampler: %s", src_strerror(srcError));
+    goto done;
+  }
 
+  const int bufferFrames = sampleRate;
+  audio.playback.buffer = ringbuffer_newUnbounded(bufferFrames,
+      channels * sizeof(float));
+
+  audio.playback.deviceTiming = ringbuffer_new(16, sizeof(PlaybackDeviceTick));
+
+  audio.playback.channels   = channels;
   audio.playback.sampleRate = sampleRate;
-  audio.playback.stride     = channels * sizeof(uint16_t);
+  audio.playback.stride     = channels * sizeof(float);
   audio.playback.state      = STREAM_STATE_SETUP;
 
+  audio.playback.deviceData.periodFrames       = 0;
+  audio.playback.deviceData.nextPosition       = 0;
+
+  audio.playback.spiceData.periodFrames        = 0;
+  audio.playback.spiceData.nextPosition        = 0;
+  audio.playback.spiceData.devLastTime         = INT64_MIN;
+  audio.playback.spiceData.devNextTime         = INT64_MIN;
+  audio.playback.spiceData.offsetError         = 0.0;
+  audio.playback.spiceData.offsetErrorIntegral = 0.0;
+  audio.playback.spiceData.ratioIntegral       = 0.0;
+
   audio.audioDev->playback.setup(channels, sampleRate, playbackPullFrames);
 
   // if a volume level was stored, set it before we return
@@ -203,6 +344,7 @@ void audio_playbackStart(int channels, int sampleRate, PSAudioFormat format,
 
   audio.playback.state = STREAM_STATE_SETUP;
 
+done:
   LG_UNLOCK(audio.playback.lock);
 }
 
@@ -246,14 +388,165 @@ void audio_playbackMute(bool mute)
 
 void audio_playbackData(uint8_t * data, size_t size)
 {
-  if (!audio.audioDev)
+  if (!audio.audioDev || size == 0)
     return;
 
   if (!STREAM_ACTIVE(audio.playback.state))
     return;
 
-  int frames = size / audio.playback.stride;
-  ringbuffer_append(audio.playback.buffer, data, frames);
+  PlaybackSpiceData * spiceData = &audio.playback.spiceData;
+  int64_t now = nanotime();
+
+  // Convert from s16 to f32 samples
+  int spiceStride = audio.playback.channels * sizeof(int16_t);
+  int frames = size / spiceStride;
+  bool periodChanged = frames != spiceData->periodFrames;
+  bool init = spiceData->periodFrames == 0;
+
+  if (periodChanged)
+  {
+    if (spiceData->framesIn)
+    {
+      free(spiceData->framesIn);
+      free(spiceData->framesOut);
+    }
+    spiceData->periodFrames  = frames;
+    spiceData->framesIn      = malloc(frames * audio.playback.stride);
+
+    spiceData->framesOutSize = round(frames * 1.1);
+    spiceData->framesOut     =
+      malloc(spiceData->framesOutSize * audio.playback.stride);
+  }
+
+  src_short_to_float_array((int16_t *) data, spiceData->framesIn,
+    frames * audio.playback.channels);
+
+  // Receive timing information from the audio device thread
+  PlaybackDeviceTick deviceTick;
+  while (ringbuffer_consume(audio.playback.deviceTiming, &deviceTick, 1))
+  {
+    spiceData->devLastTime     = spiceData->devNextTime;
+    spiceData->devLastPosition = spiceData->devNextPosition;
+    spiceData->devNextTime     = deviceTick.nextTime;
+    spiceData->devNextPosition = deviceTick.nextPosition;
+  }
+
+  // Measure the Spice audio clock
+  int64_t curTime;
+  int64_t curPosition;
+  if (periodChanged)
+  {
+    if (init)
+      spiceData->nextTime = now;
+
+    curTime     = spiceData->nextTime;
+    curPosition = spiceData->nextPosition;
+
+    spiceData->periodSec = (double) frames / audio.playback.sampleRate;
+    spiceData->nextTime += llrint(spiceData->periodSec * 1.0e9);
+
+    double bandwidth = 0.05;
+    double omega = 2.0 * M_PI * bandwidth * spiceData->periodSec;
+    spiceData->b = M_SQRT2 * omega;
+    spiceData->c = omega * omega;
+  }
+  else
+  {
+    double error = (now - spiceData->nextTime) * 1.0e-9;
+    if (fabs(error) >= 0.2)
+    {
+      // Clock error is too high; slew the write pointer and reset the timing
+      // parameters to avoid getting too far out of sync
+      int slewFrames = round(error * audio.playback.sampleRate);
+      ringbuffer_append(audio.playback.buffer, NULL, slewFrames);
+
+      curTime     = now;
+      curPosition = spiceData->nextPosition + slewFrames;
+
+      spiceData->periodSec    = (double) frames / audio.playback.sampleRate;
+      spiceData->nextTime     = now + llrint(spiceData->periodSec * 1.0e9);
+      spiceData->nextPosition = curPosition;
+    }
+    else
+    {
+      curTime     = spiceData->nextTime;
+      curPosition = spiceData->nextPosition;
+
+      spiceData->nextTime  +=
+        llrint((spiceData->b * error + spiceData->periodSec) * 1.0e9);
+      spiceData->periodSec += spiceData->c * error;
+    }
+  }
+
+  // Measure the offset between the Spice position and the device position,
+  // and how far away this is from the target latency. We use this to adjust
+  // the playback speed to bring them back in line. This value can change
+  // quite rapidly, particularly at the start of playback, so filter it to
+  // avoid sudden pitch shifts which will be noticeable to the user.
+  double offsetError = spiceData->offsetError;
+  if (spiceData->devLastTime != INT64_MIN)
+  {
+    // Interpolate to calculate the current device position
+    double devPosition = spiceData->devLastPosition +
+      (spiceData->devNextPosition - spiceData->devLastPosition) *
+        ((double) (curTime - spiceData->devLastTime) /
+          (spiceData->devNextTime - spiceData->devLastTime));
+
+    // Target latency derived experimentally to avoid underruns. This could be
+    // reduced with more tuning. We could adjust on the fly based upon the
+    // device period size, but that would result in underruns if the period size
+    // suddenly increases. It may be better instead to just reduce the maximum
+    // latency on the audio devices, which currently is set quite high
+    int targetLatencyMs = 70;
+    int targetLatencyFrames =
+      targetLatencyMs * audio.playback.sampleRate / 1000;
+
+    double actualOffset = curPosition - devPosition;
+    double actualOffsetError = -(actualOffset - targetLatencyFrames);
+
+    double error = actualOffsetError - offsetError;
+    spiceData->offsetError += spiceData->b * error +
+      spiceData->offsetErrorIntegral;
+    spiceData->offsetErrorIntegral += spiceData->c * error;
+  }
+
+  // Resample the audio to adjust the playback speed. Use a PI controller to
+  // adjust the resampling ratio based upon the measured offset
+  double kp = 0.5e-6;
+  double ki = 1.0e-16;
+
+  spiceData->ratioIntegral += offsetError * spiceData->periodSec;
+
+  double piOutput = kp * offsetError + ki * spiceData->ratioIntegral;
+  double ratio = 1.0 + piOutput;
+
+  int consumed = 0;
+  while (consumed < frames)
+  {
+    SRC_DATA srcData = {
+      .data_in = spiceData->framesIn + consumed * audio.playback.channels,
+      .data_out = spiceData->framesOut,
+      .input_frames = frames - consumed,
+      .output_frames = spiceData->framesOutSize,
+      .input_frames_used = 0,
+      .output_frames_gen = 0,
+      .end_of_input = 0,
+      .src_ratio = ratio
+    };
+
+    int error = src_process(spiceData->src, &srcData);
+    if (error)
+    {
+      DEBUG_ERROR("Resampling failed: %s", src_strerror(error));
+      return;
+    }
+
+    ringbuffer_append(audio.playback.buffer, spiceData->framesOut,
+      srcData.output_frames_gen);
+
+    consumed += srcData.input_frames_used;
+    spiceData->nextPosition += srcData.output_frames_gen;
+  }
 
   if (audio.playback.state == STREAM_STATE_SETUP)
   {