From dd2d84a08009cc465dda8352bbf4be553463a88c Mon Sep 17 00:00:00 2001 From: Chris Spencer Date: Wed, 26 Jan 2022 20:55:24 +0000 Subject: [PATCH] [client] audio: adjust playback speed to match audio device clock This change is based on the techniques described in [1] and [2]. The input audio stream from Spice is not synchronised to the audio playback device. While the input and output may be both nominally running at 48 kHz, when compared against each other, they will differ by a tiny fraction of a percent. Given enough time (typically on the order of a few hours), this will result in the ring buffer becoming completely full or completely empty. It will stay in this state permanently, periodically resulting in glitches as the buffer repeatedly underruns or overruns. To address this, adjust the speed of the received data to match the rate at which it is being consumed by the audio device. This will result in a slight pitch shift, but the changes should be small and smooth enough that this is unnoticeable to the user. The process works roughly as follows: 1. Every time audio data is received from Spice, or consumed by the audio device, sample the current time. These are fed into a pair of delay locked loops to produce smoothed approximations of the two clocks. 2. Compute the difference between the two clocks and compare this against the target latency to produce an error value. This error value will be quite stable during normal operation, but can change quite rapidly due to external factors, particularly at the start of playback. To smooth out any sudden changes in playback speed, which would be noticeable to the user, this value is also filtered through another delay locked loop. 3. Feed this error value into a PI controller to produce a ratio value. This is the target playback speed in order to bring the error value towards zero. 4. Resample the input audio using the computed ratio to apply the speed change. The output of the resampler is what is ultimately inserted into the ring buffer for consumption by the audio device. Since this process targets a specific latency value, rather than simply trying to rate match the input and output, it also has the effect of 'correcting' latency issues. If a high latency application (such as a media player) is already running, the time between requesting the start of playback and the audio device actually starting to consume samples can be very high, easily in the hundreds of milliseconds. The changes here will automatically adjust the playback speed over the course of a few minutes to bring the latency back down to the target value. [1] https://kokkinizita.linuxaudio.org/papers/adapt-resamp.pdf [2] https://kokkinizita.linuxaudio.org/papers/usingdll.pdf --- client/CMakeLists.txt | 2 + client/audiodevs/PipeWire/pipewire.c | 4 +- client/audiodevs/PulseAudio/pulseaudio.c | 9 +- client/include/interface/audiodev.h | 2 +- client/src/audio.c | 319 ++++++++++++++++++++++- 5 files changed, 318 insertions(+), 18 deletions(-) diff --git a/client/CMakeLists.txt b/client/CMakeLists.txt index ff3d4c37..4e003972 100644 --- a/client/CMakeLists.txt +++ b/client/CMakeLists.txt @@ -26,6 +26,7 @@ include(UninstallTarget) find_package(PkgConfig) pkg_check_modules(FONTCONFIG REQUIRED IMPORTED_TARGET fontconfig) +pkg_check_modules(SAMPLERATE REQUIRED IMPORTED_TARGET samplerate) option(ENABLE_OPENGL "Enable the OpenGL renderer" ON) add_feature_info(ENABLE_OPENGL ENABLE_OPENGL "Legacy OpenGL renderer.") @@ -160,6 +161,7 @@ target_compile_definitions(looking-glass-client PRIVATE CIMGUI_DEFINE_ENUMS_AND_ target_link_libraries(looking-glass-client ${EXE_FLAGS} PkgConfig::FONTCONFIG + PkgConfig::SAMPLERATE lg_common displayservers lgmp diff --git a/client/audiodevs/PipeWire/pipewire.c b/client/audiodevs/PipeWire/pipewire.c index 5d12dffb..eb726c5d 100644 --- a/client/audiodevs/PipeWire/pipewire.c +++ b/client/audiodevs/PipeWire/pipewire.c @@ -207,7 +207,7 @@ static void pipewire_playbackSetup(int channels, int sampleRate, pw.playback.channels = channels; pw.playback.sampleRate = sampleRate; - pw.playback.stride = sizeof(uint16_t) * channels; + pw.playback.stride = sizeof(float) * channels; pw.playback.pullFn = pullFn; pw.playback.startFrames = maxLatencyFrames; @@ -236,7 +236,7 @@ static void pipewire_playbackSetup(int channels, int sampleRate, params[0] = spa_format_audio_raw_build(&b, SPA_PARAM_EnumFormat, &SPA_AUDIO_INFO_RAW_INIT( - .format = SPA_AUDIO_FORMAT_S16, + .format = SPA_AUDIO_FORMAT_F32, .channels = channels, .rate = sampleRate )); diff --git a/client/audiodevs/PulseAudio/pulseaudio.c b/client/audiodevs/PulseAudio/pulseaudio.c index c313ba46..34fd2768 100644 --- a/client/audiodevs/PulseAudio/pulseaudio.c +++ b/client/audiodevs/PulseAudio/pulseaudio.c @@ -220,6 +220,11 @@ static void pulseaudio_free(void) static void pulseaudio_write_cb(pa_stream * p, size_t nbytes, void * userdata) { + // PulseAudio tries to pull data from the stream as soon as it is created for + // some reason, even though it is corked + if (pa.sinkCorked) + return; + uint8_t * dst; pa_stream_begin_write(p, (void **)&dst, &nbytes); @@ -250,7 +255,7 @@ static void pulseaudio_setup(int channels, int sampleRate, const int PERIOD_LEN = 80; pa_sample_spec spec = { - .format = PA_SAMPLE_S16LE, + .format = PA_SAMPLE_FLOAT32, .rate = sampleRate, .channels = channels }; @@ -279,7 +284,7 @@ static void pulseaudio_setup(int channels, int sampleRate, PA_STREAM_START_CORKED | PA_STREAM_ADJUST_LATENCY, NULL, NULL); - pa.sinkStride = channels * sizeof(uint16_t); + pa.sinkStride = channels * sizeof(float); pa.sinkPullFn = pullFn; pa.sinkStart = attribs.tlength / pa.sinkStride; pa.sinkCorked = true; diff --git a/client/include/interface/audiodev.h b/client/include/interface/audiodev.h index 92952f77..1c04de6a 100644 --- a/client/include/interface/audiodev.h +++ b/client/include/interface/audiodev.h @@ -45,7 +45,7 @@ struct LG_AudioDevOps struct { /* setup the stream for playback but don't start it yet - * Note: currently SPICE only supports S16 samples so always assume so + * Note: the pull function returns f32 samples */ void (*setup)(int channels, int sampleRate, LG_AudioPullFn pullFn); diff --git a/client/src/audio.c b/client/src/audio.c index e8f2d1d9..cac1a6d3 100644 --- a/client/src/audio.c +++ b/client/src/audio.c @@ -26,6 +26,9 @@ #include "dynamic/audiodev.h" +#include +#include +#include #include typedef enum @@ -40,6 +43,45 @@ StreamState; #define STREAM_ACTIVE(state) \ (state == STREAM_STATE_SETUP || state == STREAM_STATE_RUN) +typedef struct +{ + int periodFrames; + double periodSec; + int64_t nextTime; + int64_t nextPosition; + double b; + double c; +} +PlaybackDeviceData; + +typedef struct +{ + float * framesIn; + float * framesOut; + int framesOutSize; + + int periodFrames; + double periodSec; + int64_t nextTime; + int64_t nextPosition; + double b; + double c; + + int64_t devLastTime; + int64_t devNextTime; + + int64_t devLastPosition; + int64_t devNextPosition; + + double offsetError; + double offsetErrorIntegral; + + double ratioIntegral; + + SRC_STATE * src; +} +PlaybackSpiceData; + typedef struct { struct LG_AudioDevOps * audioDev; @@ -50,13 +92,21 @@ typedef struct int volumeChannels; uint16_t volume[8]; bool mute; + int channels; int sampleRate; int stride; RingBuffer buffer; + RingBuffer deviceTiming; LG_Lock lock; RingBuffer timings; GraphHandle graph; + + // These two structs contain data specifically for use in the device and + // Spice data threads respectively. Keep them on separate cache lines to + // avoid false sharing + alignas(64) PlaybackDeviceData deviceData; + alignas(64) PlaybackSpiceData spiceData; } playback; @@ -75,6 +125,13 @@ AudioState; static AudioState audio = { 0 }; +typedef struct +{ + int64_t nextTime; + int64_t nextPosition; +} +PlaybackDeviceTick; + static void playbackStopNL(void); void audio_init(void) @@ -132,6 +189,16 @@ static void playbackStopNL(void) audio.playback.state = STREAM_STATE_STOP; audio.audioDev->playback.stop(); ringbuffer_free(&audio.playback.buffer); + ringbuffer_free(&audio.playback.deviceTiming); + audio.playback.spiceData.src = src_delete(audio.playback.spiceData.src); + + if (audio.playback.spiceData.framesIn) + { + free(audio.playback.spiceData.framesIn); + free(audio.playback.spiceData.framesOut); + audio.playback.spiceData.framesIn = NULL; + audio.playback.spiceData.framesOut = NULL; + } if (audio.playback.timings) { @@ -142,13 +209,68 @@ static void playbackStopNL(void) static int playbackPullFrames(uint8_t * dst, int frames) { + DEBUG_ASSERT(frames >= 0); + if (frames == 0) + return frames; + + PlaybackDeviceData * data = &audio.playback.deviceData; + int64_t now = nanotime(); + if (audio.playback.buffer) - frames = ringbuffer_consume(audio.playback.buffer, dst, frames); + { + // Measure the device clock and post to the Spice thread + if (frames != data->periodFrames) + { + bool init = data->periodFrames == 0; + if (init) + data->nextTime = now; + + data->periodFrames = frames; + data->periodSec = (double) frames / audio.playback.sampleRate; + data->nextTime += llrint(data->periodSec * 1.0e9); + data->nextPosition += frames; + + double bandwidth = 0.05; + double omega = 2.0 * M_PI * bandwidth * data->periodSec; + data->b = M_SQRT2 * omega; + data->c = omega * omega; + } + else + { + double error = (now - data->nextTime) * 1.0e-9; + if (fabs(error) >= 0.2) + { + // Clock error is too high; slew the read pointer and reset the timing + // parameters to avoid getting too far out of sync + int slewFrames = round(error * audio.playback.sampleRate); + ringbuffer_consume(audio.playback.buffer, NULL, slewFrames); + + data->periodSec = (double) frames / audio.playback.sampleRate; + data->nextTime = now + llrint(data->periodSec * 1.0e9); + data->nextPosition += slewFrames + frames; + } + else + { + data->nextTime += + llrint((data->b * error + data->periodSec) * 1.0e9); + data->periodSec += data->c * error; + data->nextPosition += frames; + } + } + + PlaybackDeviceTick tick = { + .nextTime = data->nextTime, + .nextPosition = data->nextPosition + }; + ringbuffer_append(audio.playback.deviceTiming, &tick, 1); + + ringbuffer_consume(audio.playback.buffer, dst, frames); + } else frames = 0; if (audio.playback.state == STREAM_STATE_DRAIN && - ringbuffer_getCount(audio.playback.buffer) == 0) + ringbuffer_getCount(audio.playback.buffer) <= 0) { LG_LOCK(audio.playback.lock); playbackStopNL(); @@ -172,18 +294,37 @@ void audio_playbackStart(int channels, int sampleRate, PSAudioFormat format, playbackStopNL(); } - // Using a bounded ring buffer for now. We are not currently doing anything to - // keep the input and output in sync, so if we were using an unbounded buffer, - // it would eventually end up in a state of permanent underrun or overrun and - // the user would only hear silence - const int bufferFrames = sampleRate; - audio.playback.buffer = ringbuffer_new(bufferFrames, - channels * sizeof(uint16_t)); + int srcError; + audio.playback.spiceData.src = + src_new(SRC_SINC_BEST_QUALITY, channels, &srcError); + if (!audio.playback.spiceData.src) + { + DEBUG_ERROR("Failed to create resampler: %s", src_strerror(srcError)); + goto done; + } + const int bufferFrames = sampleRate; + audio.playback.buffer = ringbuffer_newUnbounded(bufferFrames, + channels * sizeof(float)); + + audio.playback.deviceTiming = ringbuffer_new(16, sizeof(PlaybackDeviceTick)); + + audio.playback.channels = channels; audio.playback.sampleRate = sampleRate; - audio.playback.stride = channels * sizeof(uint16_t); + audio.playback.stride = channels * sizeof(float); audio.playback.state = STREAM_STATE_SETUP; + audio.playback.deviceData.periodFrames = 0; + audio.playback.deviceData.nextPosition = 0; + + audio.playback.spiceData.periodFrames = 0; + audio.playback.spiceData.nextPosition = 0; + audio.playback.spiceData.devLastTime = INT64_MIN; + audio.playback.spiceData.devNextTime = INT64_MIN; + audio.playback.spiceData.offsetError = 0.0; + audio.playback.spiceData.offsetErrorIntegral = 0.0; + audio.playback.spiceData.ratioIntegral = 0.0; + audio.audioDev->playback.setup(channels, sampleRate, playbackPullFrames); // if a volume level was stored, set it before we return @@ -203,6 +344,7 @@ void audio_playbackStart(int channels, int sampleRate, PSAudioFormat format, audio.playback.state = STREAM_STATE_SETUP; +done: LG_UNLOCK(audio.playback.lock); } @@ -246,14 +388,165 @@ void audio_playbackMute(bool mute) void audio_playbackData(uint8_t * data, size_t size) { - if (!audio.audioDev) + if (!audio.audioDev || size == 0) return; if (!STREAM_ACTIVE(audio.playback.state)) return; - int frames = size / audio.playback.stride; - ringbuffer_append(audio.playback.buffer, data, frames); + PlaybackSpiceData * spiceData = &audio.playback.spiceData; + int64_t now = nanotime(); + + // Convert from s16 to f32 samples + int spiceStride = audio.playback.channels * sizeof(int16_t); + int frames = size / spiceStride; + bool periodChanged = frames != spiceData->periodFrames; + bool init = spiceData->periodFrames == 0; + + if (periodChanged) + { + if (spiceData->framesIn) + { + free(spiceData->framesIn); + free(spiceData->framesOut); + } + spiceData->periodFrames = frames; + spiceData->framesIn = malloc(frames * audio.playback.stride); + + spiceData->framesOutSize = round(frames * 1.1); + spiceData->framesOut = + malloc(spiceData->framesOutSize * audio.playback.stride); + } + + src_short_to_float_array((int16_t *) data, spiceData->framesIn, + frames * audio.playback.channels); + + // Receive timing information from the audio device thread + PlaybackDeviceTick deviceTick; + while (ringbuffer_consume(audio.playback.deviceTiming, &deviceTick, 1)) + { + spiceData->devLastTime = spiceData->devNextTime; + spiceData->devLastPosition = spiceData->devNextPosition; + spiceData->devNextTime = deviceTick.nextTime; + spiceData->devNextPosition = deviceTick.nextPosition; + } + + // Measure the Spice audio clock + int64_t curTime; + int64_t curPosition; + if (periodChanged) + { + if (init) + spiceData->nextTime = now; + + curTime = spiceData->nextTime; + curPosition = spiceData->nextPosition; + + spiceData->periodSec = (double) frames / audio.playback.sampleRate; + spiceData->nextTime += llrint(spiceData->periodSec * 1.0e9); + + double bandwidth = 0.05; + double omega = 2.0 * M_PI * bandwidth * spiceData->periodSec; + spiceData->b = M_SQRT2 * omega; + spiceData->c = omega * omega; + } + else + { + double error = (now - spiceData->nextTime) * 1.0e-9; + if (fabs(error) >= 0.2) + { + // Clock error is too high; slew the write pointer and reset the timing + // parameters to avoid getting too far out of sync + int slewFrames = round(error * audio.playback.sampleRate); + ringbuffer_append(audio.playback.buffer, NULL, slewFrames); + + curTime = now; + curPosition = spiceData->nextPosition + slewFrames; + + spiceData->periodSec = (double) frames / audio.playback.sampleRate; + spiceData->nextTime = now + llrint(spiceData->periodSec * 1.0e9); + spiceData->nextPosition = curPosition; + } + else + { + curTime = spiceData->nextTime; + curPosition = spiceData->nextPosition; + + spiceData->nextTime += + llrint((spiceData->b * error + spiceData->periodSec) * 1.0e9); + spiceData->periodSec += spiceData->c * error; + } + } + + // Measure the offset between the Spice position and the device position, + // and how far away this is from the target latency. We use this to adjust + // the playback speed to bring them back in line. This value can change + // quite rapidly, particularly at the start of playback, so filter it to + // avoid sudden pitch shifts which will be noticeable to the user. + double offsetError = spiceData->offsetError; + if (spiceData->devLastTime != INT64_MIN) + { + // Interpolate to calculate the current device position + double devPosition = spiceData->devLastPosition + + (spiceData->devNextPosition - spiceData->devLastPosition) * + ((double) (curTime - spiceData->devLastTime) / + (spiceData->devNextTime - spiceData->devLastTime)); + + // Target latency derived experimentally to avoid underruns. This could be + // reduced with more tuning. We could adjust on the fly based upon the + // device period size, but that would result in underruns if the period size + // suddenly increases. It may be better instead to just reduce the maximum + // latency on the audio devices, which currently is set quite high + int targetLatencyMs = 70; + int targetLatencyFrames = + targetLatencyMs * audio.playback.sampleRate / 1000; + + double actualOffset = curPosition - devPosition; + double actualOffsetError = -(actualOffset - targetLatencyFrames); + + double error = actualOffsetError - offsetError; + spiceData->offsetError += spiceData->b * error + + spiceData->offsetErrorIntegral; + spiceData->offsetErrorIntegral += spiceData->c * error; + } + + // Resample the audio to adjust the playback speed. Use a PI controller to + // adjust the resampling ratio based upon the measured offset + double kp = 0.5e-6; + double ki = 1.0e-16; + + spiceData->ratioIntegral += offsetError * spiceData->periodSec; + + double piOutput = kp * offsetError + ki * spiceData->ratioIntegral; + double ratio = 1.0 + piOutput; + + int consumed = 0; + while (consumed < frames) + { + SRC_DATA srcData = { + .data_in = spiceData->framesIn + consumed * audio.playback.channels, + .data_out = spiceData->framesOut, + .input_frames = frames - consumed, + .output_frames = spiceData->framesOutSize, + .input_frames_used = 0, + .output_frames_gen = 0, + .end_of_input = 0, + .src_ratio = ratio + }; + + int error = src_process(spiceData->src, &srcData); + if (error) + { + DEBUG_ERROR("Resampling failed: %s", src_strerror(error)); + return; + } + + ringbuffer_append(audio.playback.buffer, spiceData->framesOut, + srcData.output_frames_gen); + + consumed += srcData.input_frames_used; + spiceData->nextPosition += srcData.output_frames_gen; + } if (audio.playback.state == STREAM_STATE_SETUP) {