mpg123: Revamped audio output logic for resampling and pitching.

Now the integration of the libsyn123 resampler and the handling of the NtoM decoder are hopefully correct in the again-enlarged audio.c, which became a hollow shell with the advent of libout123, and now is strong again.

Settling decoder and output formats has historically been the most confusing aspect of libmpg123, and I add to that confusion with the integration of post-decoder resampling and pitching in the decoder, in the output device, or in said resampling between these two. Insanity!

The new resampler is the default for forcing output rates now. If you wonder why, try this in the current mpg123 source tree with your fresh build:

    for resampler in ntom dirty fine
    do
      src/mpg123 -r 22040 --resample $resampler --pitch -0.72 \
        src/tests/sweep.mp3
    done

With a pure sine sweep like that (generated via out123), you can appreciate a) the quality differences between resamplers, and b) how little those seem to matter when you just listen to music. Really, without a detailed comparison and some noisy pop/rock music, it is surprisingly hard to tell how bad the drop-sample resampling of the NtoM decoder really is.

But this has a price: Even though I took great pains in designing the syn123 resampler, it needs more computing time than the MPEG decoder itself. That's life. You can make resamplers that are faster, but at the cost of increased latency, which makes programming tedious. Here, I just ignored that aspect, as the syn123 resampling latency is just a handful of samples, well below the farts you get from ALSA on closing a device.

git-svn-id: svn://scm.orgis.org/mpg123/trunk@4662 35dc7657-300d-0410-a2e5-dc2837fedb53
2025-10-23 16:48:31 +03:00 · 2020-04-26 16:33:28 +00:00
parent 01ea1fb42d
commit 8e8da9974c
4 changed files with 166 additions and 108 deletions
--- a/man1/mpg123.1
+++ b/man1/mpg123.1
@@ -254,13 +254,25 @@ change this if you need a constant bitrate independent of
 the mpeg stream rate. mpg123 automagically converts the
 rate. You should then combine this with \-\-stereo or \-\-mono.
 .TP
+\fB\-\^\-resample \fImethod
+Set resampling method to employ if forcing an output rate. Choices (case-insensitive) are NtoM,
+dirty, and fine. The fine resampler is the default. It employs libsyn123's low-latency, fairly
+efficient resampler to postprocess the output from libmpg123 instead of the fast but very crude
+NtoM decoder (drop sample method) that mpg123 has offered for decades. If you are really low on
+CPU time, choose NtoM, as the resampler usually needs more time than the MPEG decoder itself.
+The mpg123 program is smart enough to combine the 2to1 or 4to1 downsampling modes with the
+postprocessing for extreme downsampling.
+.TP
 .BR \-2 ", " \-\^\-2to1 "; " \-4 ", " \-\^\-4to1
-Performs a downsampling of ratio 2:1 (22 kHz) or 4:1 (11 kHz) 
-on the output stream, respectively. Saves some CPU cycles, but 
-at least the 4:1 ratio sounds ugly.
+Performs a downsampling of ratio 2:1 (22 kHz from 44.1 kHz) or 4:1 (11 kHz) 
+on the output stream, respectively. Saves some CPU cycles, but of course throws away
+the high frequencies, as the decoder does not bother producing them.
 .TP
 .BR \-\-pitch\ \fIvalue
-Set hardware pitch (speedup/down, 0 is neutral; 0.05 is 5%). This changes the output sampling rate, so it only works in the range your audio system/hardware supports.
+Set a pitch change (speedup/down, 0 is neutral; 0.05 is 5% speedup).  When not enforcing an
+output rate, this changes the output sampling rate, so it only works in the range your audio
+system/hardware supports. When you combine this with a fixed output rate, it modifies a
+software resampling ratio instead.
 .TP
 .BR \-\-8bit
 Forces 8bit output
--- a/src/audio.c
+++ b/src/audio.c
@@ -47,6 +47,7 @@
 static syn123_handle *sh = NULL;
 static struct mpg123_fmt outfmt = { .encoding=0, .rate=0, .channels=0 };
 static int outch = 0; // currently used number of output channels
+
 // A convoluted way to say outch*4, for semantic clarity.
 #define RESAMPLE_FRAMESIZE(ch) ((ch)*MPG123_SAMPLESIZE(MPG123_ENC_FLOAT_32))
 #define OUTPUT_FRAMESIZE(ch)   ((ch)*MPG123_SAMPLESIZE(outfmt.encoding))
@@ -61,20 +62,28 @@ static size_t resample_block = 0;
 // 1152*48/44.1*2*4 = 10032 ... let's go 16K.
 // This should work for final output data, too.
 // We'll loop over pieces if the buffer size is not enough for upsampling.
-static size_t resample_bytes = 1<<14;
+static size_t resample_bytes = 1<<16;
 int do_resample = 0;
 int do_resample_now = 0; // really apply resampler for current stream.

+/* Quick-shot paired table setup with remembering search in it.
+   this is for storing pairs of output sampling rate and decoding
+   sampling rate. */
+struct ratepair { long a; long b; };
+static long *outrates = NULL;
+static struct ratepair *unpitch = NULL;
+
+
 static int audio_capabilities(out123_handle *ao, mpg123_handle *mh);

+#define CLEAN_POINTER(p, func) if(p) func(p); p = NULL;
 void audio_cleanup(void)
 {
-	if(sh)
-		syn123_del(sh);
-	if(resample_outbuf)
-		free(resample_outbuf);
-	if(resample_buffer)
-		free(resample_buffer);
+	CLEAN_POINTER(outrates, free)
+	CLEAN_POINTER(unpitch, free)
+	CLEAN_POINTER(sh, syn123_del)
+	CLEAN_POINTER(resample_outbuf, free)
+	CLEAN_POINTER(resample_buffer, free)
 }

 int audio_setup(out123_handle *ao, mpg123_handle *mh)
@@ -96,8 +105,8 @@ int audio_setup(out123_handle *ao, mpg123_handle *mh)
 			merror("Cannot initialize syn123: %s\n", syn123_strerror(err));
 			return -1;
 		}
-		resample_buffer = malloc(resample_bytes);
-		resample_outbuf = malloc(resample_bytes);
+		resample_buffer = malloc(resample_bytes*10);
+		resample_outbuf = malloc(resample_bytes*10);
 		if(!resample_buffer || !resample_outbuf)
 			return -1;
 	}
@@ -106,9 +115,9 @@ int audio_setup(out123_handle *ao, mpg123_handle *mh)

 int audio_prepare(out123_handle *ao, long rate, int channels, int encoding)
 {
-	mdebug( "audio_prepare %ld Hz / %ld Hz, %i ch, enc %i"
-	,	rate, outfmt.rate, channels, encoding );
-	if(do_resample && rate == outfmt.rate)
+	mdebug( "audio_prepare %ld Hz / %ld Hz, %i ch, enc %s"
+	,	rate, outfmt.rate, channels, out123_enc_name(encoding) );
+	if(do_resample && param.pitch == 0. && rate == outfmt.rate)
 	{
 		do_resample_now = 0;
 		debug("disabled resampler for native rate");
@@ -116,11 +125,11 @@ int audio_prepare(out123_handle *ao, long rate, int channels, int encoding)
 	{
 		do_resample_now = 1;
 		// Smooth option could be considered once pitching is implemented with the
-		// resampler.The exiting state might fit the coming data if this is two
+		// resampler. The existing state might fit the coming data if this is two
 		// seamless tracks. If not, it's jut the first few samples that differ
 		// significantly depending on which data went through the resampler
 		// previously.
-		int err = syn123_setup_resample( sh, rate, outfmt.rate, channels
+		int err = syn123_setup_resample( sh, pitch_rate(rate), outfmt.rate, channels
 		,	(param.resample < 2), 0 );
 		if(err)
 		{
@@ -135,14 +144,17 @@ int audio_prepare(out123_handle *ao, long rate, int channels, int encoding)
 			?	RESAMPLE_FRAMESIZE(channels)
 			:	OUTPUT_FRAMESIZE(channels) );
 		// Minimum amount of input samples to fill the buffer.
-		resample_block = syn123_resample_fillcount(rate, outfmt.rate, frames);
+		resample_block = syn123_resample_fillcount(pitch_rate(rate), outfmt.rate, frames);
 		if(!resample_block)
 			return -1; // WTF? No comment.
-		mdebug("resampler setup %ld -> %ld, block %zu", rate, outfmt.rate, resample_block);
+		mdebug("resampler setup %ld -> %ld, block %zu", pitch_rate(rate), outfmt.rate, resample_block);
 		rate     = outfmt.rate;
 		encoding = outfmt.encoding;
-	}
-	return out123_start(ao, pitch_rate(rate), channels, encoding);
+	} else if(outfmt.rate)
+		rate = outfmt.rate; // That's pitching with NtoM.
+	else
+		rate = pitch_rate(rate); // That's plain hardware pitching.
+	return out123_start(ao, rate, channels, encoding);
 }

 // Loop over blocks with the resampler, think about intflag.
@@ -216,7 +228,7 @@ static void capline(mpg123_handle *mh, long rate, struct mpg123_fmt *outfmt)
 	const int  *encs;
 	size_t      num_encs;
 	mpg123_encodings(&encs, &num_encs);
-	fprintf(stderr," %5ld |", pitch_rate(outfmt ? outfmt->rate : rate));
+	fprintf(stderr," %5ld |", outfmt ? outfmt->rate : rate);
 	for(enci=0; enci<num_encs; ++enci)
 	{
 		int fmt = outfmt
@@ -272,22 +284,29 @@ void print_capabilities(out123_handle *ao, mpg123_handle *mh)
 		if(do_resample)
 			capline(mh, 0, &outfmt);
 		else
-			capline(mh, param.force_rate, NULL);
+			capline(mh,  bpitch_rate(param.force_rate), NULL);
 	}
 	fprintf(stderr,"\n");
 	if(do_resample)
+	{
+		if(param.pitch != 0.)
+			fprintf( stderr, "Resampler with pitch: %g\n"
+			,	param.pitch );
+		else
+			fprintf(stderr, "Resampler configured.\n");
 		fprintf( stderr, "%s\n%s\n"
-		,	"Resampler configured. Decoding to f32 as intermediate if needed."
+		,	"Decoding to f32 as intermediate if needed."
 		,	"Resampler output format is in the last line." );
+	}
 	else if(param.force_rate)
-		fprintf( stderr, "%s\n"
-		,	"Decoder rate forced. Resulting format support shown in last line." );
+		fprintf( stderr
+		,	"%s rate forced. Resulting format support shown in last line.\n"
+		,	param.pitch != 0. ? "Pitched decoder" : "Decoder" );
+	else if(param.pitch != 0.)
+		fprintf( stderr, "Actual output rates adjusted by pitch value %g.\n"
+		,	param.pitch );
 }

-/* Quick-shot paired table setup with remembering search in it.
-   this is for storing pairs of output sampling rate and decoding
-   sampling rate. */
-struct ratepair { long a; long b; };

 long brate(struct ratepair *table, long arate, int count, int *last)
 {
@@ -330,12 +349,12 @@ static int audio_capabilities(out123_handle *ao, mpg123_handle *mh)
 	/* Pitching introduces a difference between decoder rate and playback rate. */
 	long decode_rate;
 	const long *rates;
-	long *outrates;
-	struct ratepair *unpitch;
 	struct mpg123_fmt *outfmts = NULL;
 	int fmtcount;
 	size_t num_rates, rlimit;
-	long ntom_rate = do_resample ? 0 : param.force_rate;
+	if(param.pitch < -0.99)
+		param.pitch = -0.99;
+	long ntom_rate = do_resample ? 0 : bpitch_rate(param.force_rate);
 	outfmt.rate = param.force_rate;
 	outfmt.channels = 0;
 	outfmt.encoding = 0;
@@ -371,14 +390,18 @@ static int audio_capabilities(out123_handle *ao, mpg123_handle *mh)
 			,	(unsigned)force_fmt, out123_enc_name(force_fmt));
 	}

+	// A possible optimization for resampling mode is to keep existing output
+	// format support configured and don't even interrupt the output device at
+	// all. If you change pitch, you just change a number for the resampler.
+	// But currently, the idea of re-opening the output device on format
+	// changes is rather ingrained in mpg123.
+
 	if(do_resample)
 	{
-		if(param.pitch != 0)
-			fprintf(stderr, "WARNING: interaction of pitch and resampler not yet settled\n");
 		// If really doing the extra resampling, output will always run with
 		// this setup, regardless of decoder.
-		int enc1 = out123_encodings(ao, param.force_rate, 1);
-		int enc2 = out123_encodings(ao, param.force_rate, 2);
+		int enc1 = out123_encodings(ao, outfmt.rate, 1);
+		int enc2 = out123_encodings(ao, outfmt.rate, 2);
 		if(force_fmt)
 		{
 			enc1 &= force_fmt;
@@ -431,27 +454,80 @@ static int audio_capabilities(out123_handle *ao, mpg123_handle *mh)
 				error("Perhaps your forced output encoding is not supported.");
 			return -1;
 		}
+		const char *encname = out123_enc_name(outfmt.encoding);
+		if(param.verbose > 1)
+			for(int ch=MPG123_MONO; ch<=MPG123_STEREO; ++ch)
+				if(outfmt.channels & ch)
+					fprintf(stderr, "Note: output format %li Hz, %s, %s\n"
+					,	outfmt.rate, ch==MPG123_MONO ? "mono" : "stereo"
+					,	encname ? encname : "???" );
 	}

-	/* Lots of preparation of rate lists. */
-	rlimit = ntom_rate > 0 ? num_rates+1 : num_rates;
-	outrates = malloc(sizeof(*rates)*rlimit);
-	unpitch  = malloc(sizeof(*unpitch)*rlimit);
+	// Either enable or disable rate forcing, with ntom_rate non-zero or not.
+	if(mpg123_param(mh, MPG123_FORCE_RATE, ntom_rate, 0) != MPG123_OK)
+	{
+		merror("Cannot force NtoM rate: %s", mpg123_strerror(mh));
+		return -1;
+	}
+
+	if(ntom_rate)
+	{
+		// Only that one rate is enforced. Nothing else needs to be checked.
+		// For pitching, ntom_rate has been adjusted. The output uses outfmt.rate.
+		// Need to tell mpg123 about the forced rate to make it work.
+		for(int ch=1; ch<=2; ++ch)
+		{
+			int fmts = out123_encodings(ao, outfmt.rate, ch);
+			if(param.verbose > 2)
+				fprintf( stderr
+				,	"Note: output support for %li Hz, %s: 0x%x\n"
+				,	outfmt.rate, ch==MPG123_MONO ? "mono" : "stereo", fmts );
+			if(force_fmt)
+				fmts = ((fmts & force_fmt) == force_fmt) ? force_fmt : 0;
+			mpg123_format(mh, ntom_rate, ch, fmts);
+		}
+	} else if(do_resample)
+	{
+		// Support any decoding rate with float output for the resampler and also
+		// direct decoding to configured output format.
+		// One twist: Disable high rates with signal that the resampler will throw
+		// away anyway. This includes pitch. 22040 Hz output rate with pitch -0.5
+		// still wants the full 44100 Hz input data, as original signal up to
+		// 22040 Hz will be heard as up to 11020 Hz. So we want pitch_rate()
+		// to be above outfmt.rate. Final resampling ratio not above 2.
+		for(ri=0; ri<num_rates; ++ri)
+		{
+			if(rates[ri] > 12000 && pitch_rate(rates[ri]) > outfmt.rate*2)
+				break;
+			int fmt = (param.pitch == 0. && rates[ri] == outfmt.rate)
+			? outfmt.encoding
+			: MPG123_ENC_FLOAT_32;
+			mpg123_format(mh, rates[ri], outfmt.channels, fmt);
+		}
+	} else
+	{
+		// Finally, the old style, direct decoding to possibly pitched output.
+		if(!outrates)
+			outrates = malloc(sizeof(*rates)*num_rates);
+		if(!unpitch)
+			unpitch  = malloc(sizeof(*unpitch)*num_rates);
 		if(!outrates || !unpitch)
 		{
+			CLEAN_POINTER(outrates, free)
+			CLEAN_POINTER(unpitch, free)
 			error("DOOM");
 			return -1;
 		}
-	for(ri = 0; ri<rlimit; ri++)
+		for(ri = 0; ri<num_rates; ri++)
 		{
-		decode_rate = ri < num_rates ? rates[ri] : ntom_rate;
+			decode_rate   = rates[ri];
 			outrates[ri]  = pitch_rate(decode_rate);
 			unpitch[ri].a = outrates[ri];
 			unpitch[ri].b = decode_rate;
 		}
 		/* Actually query formats possible with given rates. */
-	fmtcount = out123_formats(ao, outrates, rlimit, 1, 2, &outfmts);
-	free(outrates);
+		fmtcount = out123_formats(ao, outrates, num_rates, 1, 2, &outfmts);
+		// Remember: First one is a default format, then come my rates.
 		if(fmtcount > 0)
 		{
 			int fi;
@@ -471,41 +547,13 @@ static int audio_capabilities(out123_handle *ao, mpg123_handle *mh)
 					,	"Note: output support for %li Hz, %i channels: 0x%x\n"
 					,	outfmts[fi].rate, outfmts[fi].channels, outfmts[fi].encoding );
 				if(force_fmt)
-			{ /* Filter for forced encoding. */
-				if((fmts & force_fmt) == force_fmt)
-					fmts = force_fmt;
-				else /* Nothing else! */
-					fmts = 0;
-			}
-			// Support the resampler or native playback. Condition for the resampler
-			// to work is decoding to float and keeping a channel count compatible
-			// with configured output (in a case that might differ for various encodings).
-			long decode_rate = brate(unpitch, outfmts[fi].rate, rlimit, &unpitch_i);
-			if(do_resample && decode_rate != outfmt.rate)
-			{
-				fmts = 0;
-				// Only enable float outupt for resampler if needed and channel
-				// count supported for real output format.
-				if((outfmts[fi].channels & outfmt.channels) == outfmts[fi].channels)
-					fmts = MPG123_ENC_FLOAT_32;
-				// Also, be smart and let the internal downsampling work for small output
-				// rates. If target is 22050, decoding to 44100 and 48000 is not sensible,
-				// rather do 22050 or 24000. We have a factor of 4 to play with.
-				// So any input file, with max rate of 48000, can be decoded down to
-				// 12000 at least, actually saving computing time, if not in the decoder,
-				// then in the resampler.
-				// Every rate above 12000 can be halved to still get a valid MPEG rate.
-				// Output of 12001 Hz needs decoding to 16000 up to 23999, but not more.
-				// My resampler does not care about very small resampling steps; the less
-				// samples, the less work.
-				if(decode_rate > 12000 && decode_rate > outfmt.rate*2)
-					fmts = 0;
-			}
+					fmts = ((fmts & force_fmt) == force_fmt) ? force_fmt : 0;
+				decode_rate = brate(unpitch, outfmts[fi].rate, num_rates, &unpitch_i);
 				mpg123_format(mh, decode_rate, outfmts[fi].channels, fmts);
 			}
 		}
 		free(outfmts);
-	free(unpitch);
+	}

 	if(param.verbose > 1) print_capabilities(ao, mh);

@@ -534,7 +582,6 @@ int set_pitch(mpg123_handle *fr, out123_handle *ao, double new_pitch)
 	}

 	param.pitch = new_pitch;
-	if(param.pitch < -0.99) param.pitch = -0.99;

 	if(channels == 1) smode = MPG123_MONO;
 	if(channels == 2) smode = MPG123_STEREO;
@@ -552,7 +599,7 @@ int set_pitch(mpg123_handle *fr, out123_handle *ao, double new_pitch)
 		param.pitch = old_pitch;
 		audio_capabilities(ao, fr);
 	}
-	return out123_start(ao, pitch_rate(rate), channels, format);
+	return audio_prepare(ao, rate, channels, format);
 }

 int set_mute(out123_handle *ao, int mutestate)
--- a/src/audio.h
+++ b/src/audio.h
@@ -21,6 +21,8 @@
 #include "out123.h"

 #define pitch_rate(rate)	(param.pitch == 0 ? (rate) : (long) ((param.pitch+1.0)*(rate)))
+#define bpitch_rate(rate)	(param.pitch == 0 ? (rate) : (long) ((rate)/(param.pitch+1.0)))
+

 void audio_cleanup(void);
 int audio_setup(out123_handle *ao, mpg123_handle *mh);
--- a/src/mpg123.c
+++ b/src/mpg123.c
@@ -1124,7 +1124,6 @@ int main(int sys_argc, char ** sys_argv)
 	if(dnow != 0) param.cpu = (dnow == SET_3DNOW) ? "3dnow" : "i586";
 #endif
 	if(param.cpu != NULL && (!strcmp(param.cpu, "auto") || !strcmp(param.cpu, ""))) param.cpu = NULL;
-	long ntom_rate = param.resample ? 0 : param.force_rate;
 	if(!(  MPG123_OK == (result = mpg123_par(mp, MPG123_VERBOSE, param.verbose, 0))
 	    && ++libpar
 	    && MPG123_OK == (result = mpg123_par(mp, MPG123_FLAGS, param.flags, 0))
@@ -1133,8 +1132,6 @@ int main(int sys_argc, char ** sys_argv)
 	    && ++libpar
 	    && MPG123_OK == (result = mpg123_par(mp, MPG123_RVA, param.rva, 0))
 	    && ++libpar
-	    && MPG123_OK == (result = mpg123_par(mp, MPG123_FORCE_RATE, ntom_rate, 0))
-	    && ++libpar
 	    && MPG123_OK == (result = mpg123_par(mp, MPG123_DOWNSPEED, param.halfspeed, 0))
 	    && ++libpar
 	    && MPG123_OK == (result = mpg123_par(mp, MPG123_UPSPEED, param.doublespeed, 0))
@@ -1632,11 +1629,11 @@ static void long_usage(int err)
 	fprintf(o," -m     --mono --mix       mix stereo to mono\n");
 	fprintf(o,"        --stereo           duplicate mono channel\n");
 	fprintf(o," -r     --rate             force a specific audio output rate\n");
-	fprintf(o,"        --resample         choose resampling mode for forced rate:\n"
+	fprintf(o,"        --resample <s>     choose resampling mode for forced rate:\n"
 	          "                           NtoM, dirty, fine (default)\n");
 	fprintf(o," -2     --2to1             2:1 downsampling\n");
 	fprintf(o," -4     --4to1             4:1 downsampling\n");
-  fprintf(o,"        --pitch <value>    set hardware pitch (speedup/down, 0 is neutral; 0.05 is 5%%)\n");
+	fprintf(o,"        --pitch <value>    set pitch (speedup/down, 0 is neutral; 0.05 is 5%%)\n");
 	fprintf(o,"        --8bit             force 8 bit output\n");
 	fprintf(o,"        --float            force floating point output (internal precision)\n");
 	fprintf(o," -e <c> --encoding <c>     force a specific encoding (%s)\n"