" por %%xmm4, "#s1" \n\t" /* .. | l h | */ \
" por %%xmm5, "#s2" \n\t"
-static void
-pa_volume_s16ne_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
-{
+
+static int channel_overread_table[8] = {8,8,8,12,8,10,12,14}; /* Lookup used when
+ * the caller passes fewer than 8 channels: indexed by that channel count (0..7),
+ * each entry is >= 8 and, for a non-zero index, a multiple of the index.
+ * NOTE(review): only read in the code visible here; could likely be const. */
+
+static void pa_volume_s16ne_sse2(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
pa_reg_x86 channel, temp;
- /* the max number of samples we process at a time, this is also the max amount
- * we overread the volume array, which should have enough padding. */
- channels = PA_MAX (8U, channels);
+ /* Channels must be at least 8 and always a multiple of the original number.
+ * This is also the max amount we overread the volume array, which should
+ * have enough padding.
+ * For fewer than 8 channels the rounded-up count is looked up in
+ * channel_overread_table, indexed by the original channel count; counts of
+ * 8 or more are used unchanged.
+ * This function is registered for PA_SAMPLE_S16NE by
+ * pa_volume_func_init_sse(). */
+ if (channels < 8)
+ channels = channel_overread_table[channels];
__asm__ __volatile__ (
" xor %3, %3 \n\t"
);
}
-static void
-pa_volume_s16re_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
-{
+static void pa_volume_s16re_sse2(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
pa_reg_x86 channel, temp;
- /* the max number of samples we process at a time, this is also the max amount
- * we overread the volume array, which should have enough padding. */
- channels = PA_MAX (8U, channels);
+ /* Channels must be at least 8 and always a multiple of the original number.
+ * This is also the max amount we overread the volume array, which should
+ * have enough padding. */
+ if (channels < 8)
+ channels = channel_overread_table[channels];
__asm__ __volatile__ (
" xor %3, %3 \n\t"
#ifdef RUN_TEST
#define CHANNELS 2
-#define SAMPLES 1021
+#define SAMPLES 1022
#define TIMES 1000
#define PADDING 16
-static void run_test (void) {
+static void run_test(void) {
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
pa_do_volume_func_t func;
pa_usec_t start, stop;
- func = pa_get_volume_func (PA_SAMPLE_S16NE);
+ func = pa_get_volume_func(PA_SAMPLE_S16NE);
- printf ("checking SSE2 %zd\n", sizeof (samples));
+ printf("checking SSE2 %zd\n", sizeof(samples));
- pa_random (samples, sizeof (samples));
- memcpy (samples_ref, samples, sizeof (samples));
- memcpy (samples_orig, samples, sizeof (samples));
+ pa_random(samples, sizeof(samples));
+ memcpy(samples_ref, samples, sizeof(samples));
+ memcpy(samples_orig, samples, sizeof(samples));
for (i = 0; i < CHANNELS; i++)
- volumes[i] = rand() >> 1;
+ volumes[i] = PA_CLAMP_VOLUME(rand() >> 1);
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
- func (samples_ref, volumes, CHANNELS, sizeof (samples));
- pa_volume_s16ne_sse2 (samples, volumes, CHANNELS, sizeof (samples));
+ func(samples_ref, volumes, CHANNELS, sizeof(samples));
+ pa_volume_s16ne_sse2(samples, volumes, CHANNELS, sizeof(samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
start = pa_rtclock_now();
for (j = 0; j < TIMES; j++) {
- memcpy (samples, samples_orig, sizeof (samples));
- pa_volume_s16ne_sse2 (samples, volumes, CHANNELS, sizeof (samples));
+ memcpy(samples, samples_orig, sizeof(samples));
+ pa_volume_s16ne_sse2(samples, volumes, CHANNELS, sizeof(samples));
}
stop = pa_rtclock_now();
pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
start = pa_rtclock_now();
for (j = 0; j < TIMES; j++) {
- memcpy (samples_ref, samples_orig, sizeof (samples));
- func (samples_ref, volumes, CHANNELS, sizeof (samples));
+ memcpy(samples_ref, samples_orig, sizeof(samples));
+ func(samples_ref, volumes, CHANNELS, sizeof (samples));
}
stop = pa_rtclock_now();
pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
#endif
#endif /* defined (__i386__) || defined (__amd64__) */
-void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) {
+void pa_volume_func_init_sse(pa_cpu_x86_flag_t flags) { /* On i386/amd64 builds:
+ * calls run_test() first when RUN_TEST is defined, then, if flags contains
+ * PA_CPU_X86_SSE2, registers the SSE2 volume functions for PA_SAMPLE_S16NE
+ * and PA_SAMPLE_S16RE. On other architectures the body is compiled out. */
#if defined (__i386__) || defined (__amd64__)
#ifdef RUN_TEST
- run_test ();
+ run_test();
#endif
if (flags & PA_CPU_X86_SSE2) {
pa_log_info("Initialising SSE2 optimized functions.");
- pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse2);
- pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse2);
+ pa_set_volume_func(PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse2);
+ pa_set_volume_func(PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse2);
}
#endif /* defined (__i386__) || defined (__amd64__) */
}