X-Git-Url: https://code.delx.au/pulseaudio/blobdiff_plain/3cb8e0191c9e79a4d39f788b42449d12032b462c..ecf09f2cd6bf2f46b0d2402c700ab618c090bd4c:/src/pulsecore/svolume_sse.c diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c index 5276bda4..86828072 100644 --- a/src/pulsecore/svolume_sse.c +++ b/src/pulsecore/svolume_sse.c @@ -25,6 +25,8 @@ #endif #include +#include + #include #include #include @@ -74,14 +76,17 @@ " por %%xmm4, "#s1" \n\t" /* .. | l h | */ \ " por %%xmm5, "#s2" \n\t" -static void -pa_volume_s16ne_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) -{ + +static int channel_overread_table[8] = {8,8,8,12,8,10,12,14}; + +static void pa_volume_s16ne_sse2(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { pa_reg_x86 channel, temp; - /* the max number of samples we process at a time, this is also the max amount - * we overread the volume array, which should have enough padding. */ - channels = PA_MAX (8U, channels); + /* Channels must be at least 8 and always a multiple of the original number. + * This is also the max amount we overread the volume array, which should + * have enough padding. */ + if (channels < 8) + channels = channel_overread_table[channels]; __asm__ __volatile__ ( " xor %3, %3 \n\t" @@ -149,19 +154,19 @@ pa_volume_s16ne_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, uns "8: \n\t" : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) - : "r" ((pa_reg_x86)channels) + : "rm" ((pa_reg_x86)channels) : "cc" ); } -static void -pa_volume_s16re_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) -{ +static void pa_volume_s16re_sse2(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { pa_reg_x86 channel, temp; - /* the max number of samples we process at a time, this is also the max amount - * we overread the volume array, which should have enough padding. */ - channels = PA_MAX (8U, channels); + /* Channels must be at least 8 and always a multiple of the original number. + * This is also the max amount we overread the volume array, which should + * have enough padding. */ + if (channels < 8) + channels = channel_overread_table[channels]; __asm__ __volatile__ ( " xor %3, %3 \n\t" @@ -237,7 +242,7 @@ pa_volume_s16re_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, uns "8: \n\t" : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) - : "r" ((pa_reg_x86)channels) + : "rm" ((pa_reg_x86)channels) : "cc" ); } @@ -246,11 +251,11 @@ pa_volume_s16re_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, uns #ifdef RUN_TEST #define CHANNELS 2 -#define SAMPLES 1021 +#define SAMPLES 1022 #define TIMES 1000 #define PADDING 16 -static void run_test (void) { +static void run_test(void) { int16_t samples[SAMPLES]; int16_t samples_ref[SAMPLES]; int16_t samples_orig[SAMPLES]; @@ -259,21 +264,21 @@ static void run_test (void) { pa_do_volume_func_t func; pa_usec_t start, stop; - func = pa_get_volume_func (PA_SAMPLE_S16NE); + func = pa_get_volume_func(PA_SAMPLE_S16NE); - printf ("checking SSE %zd\n", sizeof (samples)); + printf("checking SSE2 %zd\n", sizeof(samples)); - pa_random (samples, sizeof (samples)); - memcpy (samples_ref, samples, sizeof (samples)); - memcpy (samples_orig, samples, sizeof (samples)); + pa_random(samples, sizeof(samples)); + memcpy(samples_ref, samples, sizeof(samples)); + memcpy(samples_orig, samples, sizeof(samples)); for (i = 0; i < CHANNELS; i++) - volumes[i] = rand() >> 1; + volumes[i] = PA_CLAMP_VOLUME(rand() >> 1); for (padding = 0; padding < PADDING; padding++, i++) volumes[i] = volumes[padding]; - func (samples_ref, volumes, CHANNELS, sizeof (samples)); - pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); + func(samples_ref, volumes, CHANNELS, sizeof(samples)); + pa_volume_s16ne_sse2(samples, volumes, CHANNELS, sizeof(samples)); for (i = 0; i < SAMPLES; i++) { if (samples[i] != samples_ref[i]) { printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], @@ -283,35 +288,37 @@ static void run_test (void) { start = pa_rtclock_now(); for (j = 0; j < TIMES; j++) { - memcpy (samples, samples_orig, sizeof (samples)); - pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); + memcpy(samples, samples_orig, sizeof(samples)); + pa_volume_s16ne_sse2(samples, volumes, CHANNELS, sizeof(samples)); } stop = pa_rtclock_now(); pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start)); start = pa_rtclock_now(); for (j = 0; j < TIMES; j++) { - memcpy (samples_ref, samples_orig, sizeof (samples)); - func (samples_ref, volumes, CHANNELS, sizeof (samples)); + memcpy(samples_ref, samples_orig, sizeof(samples)); + func(samples_ref, volumes, CHANNELS, sizeof (samples)); } stop = pa_rtclock_now(); pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); + + pa_assert_se(memcmp(samples_ref, samples, sizeof(samples)) == 0); } #endif #endif /* defined (__i386__) || defined (__amd64__) */ -void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) { +void pa_volume_func_init_sse(pa_cpu_x86_flag_t flags) { #if defined (__i386__) || defined (__amd64__) #ifdef RUN_TEST - run_test (); + run_test(); #endif if (flags & PA_CPU_X86_SSE2) { pa_log_info("Initialising SSE2 optimized functions."); - pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse2); - pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse2); + pa_set_volume_func(PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse2); + pa_set_volume_func(PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse2); } #endif /* defined (__i386__) || defined (__amd64__) */ }