2 This file is part of PulseAudio.
4 Copyright 2013 Peter Meerwald <pmeerw@pmeerw.net>
6 PulseAudio is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation; either version 2.1 of the License,
9 or (at your option) any later version.
11 PulseAudio is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
21 #include <pulsecore/macro.h>
22 #include <pulsecore/endianmacros.h>
23 #include <pulsecore/sample-util.h>
/* Mix function that was registered for PA_SAMPLE_S16NE before the NEON
 * version was installed. It is assigned in pa_mix_func_init_neon() and is
 * called by the code in this file for leftover bytes and for stream/channel
 * combinations that have no NEON kernel. */
30 static pa_do_mix_func_t fallback
;
32 /* special case: mix s16ne streams, 2 channels each
 *
 * Mixes nstreams streams of interleaved 2-channel s16ne samples into data.
 * mask is 15, so the NEON part covers length rounded down to a multiple of
 * 16 bytes; the remaining (length & mask) bytes are handed to fallback().
 * For every stream the per-channel volumes are read from linear[0].i and
 * linear[1].i, and the stream's ptr member is advanced while its samples are
 * consumed (it is a read-write asm operand).
 *
 * NOTE(review): this view of the function skips lines (the embedded line
 * numbers have gaps). The declarations of sum0, sum1 and i, the loop that
 * consumes end, the [data] asm operand, the asm terminators and the closing
 * braces are not visible here -- confirm against the complete file. */
33 static void pa_mix_ch2_s16ne_neon(pa_mix_info streams
[], unsigned nstreams
, uint8_t *data
, unsigned length
) {
34 const unsigned mask
= sizeof(int16_t) * 8 - 1;
/* first byte past the part of the buffer made of whole 16-byte groups */
35 const uint8_t *end
= data
+ (length
& ~mask
);
/* Zero both accumulators: a register XORed with itself is 0. */
41 __asm__
__volatile__ (
42 "veor.s32 %q[sum0], %q[sum0] \n\t"
43 "veor.s32 %q[sum1], %q[sum1] \n\t"
44 : [sum0
] "=w" (sum0
), [sum1
] "=w" (sum1
)
46 : "cc" /* clobber list */
/* Add the volume-scaled samples of every stream to the accumulators. */
49 for (i
= 0; i
< nstreams
; i
++) {
50 pa_mix_info
*m
= streams
+ i
;
51 int32_t cv0
= m
->linear
[0].i
;
52 int32_t cv1
= m
->linear
[1].i
;
/* vld2 de-interleaves four frames: first channel -> d0, second -> d2.
 * The samples are widened to 32 bit with a left shift by 15; vqdmulh
 * (saturating doubling multiply, high half) by the channel volume then
 * yields (sample * volume) >> 16, which is added with saturation to the
 * per-channel sum. */
54 __asm__
__volatile__ (
55 "vld2.s16 {d0,d2}, [%[ptr]]! \n\t"
56 "vmov.s32 d4[0], %[cv0] \n\t"
57 "vmov.s32 d4[1], %[cv1] \n\t"
58 "vshll.s16 q0, d0, #15 \n\t"
59 "vshll.s16 q1, d2, #15 \n\t"
60 "vqdmulh.s32 q0, q0, d4[0] \n\t"
61 "vqdmulh.s32 q1, q1, d4[1] \n\t"
62 "vqadd.s32 %q[sum0], %q[sum0], q0 \n\t"
63 "vqadd.s32 %q[sum1], %q[sum1], q1 \n\t"
64 : [ptr
] "+r" (m
->ptr
), [sum0
] "+w" (sum0
), [sum1
] "+w" (sum1
)
65 : [cv0
] "r" (cv0
), [cv1
] "r" (cv1
)
66 : "memory", "cc", "q0", "q1", "d4" /* clobber list */
/* Narrow both sums back to 16 bit with saturation and store them
 * re-interleaved, post-incrementing the output pointer. */
70 __asm__
__volatile__ (
71 "vqmovn.s32 d0, %q[sum0] \n\t"
72 "vqmovn.s32 d1, %q[sum1] \n\t"
73 "vst2.s16 {d0,d1}, [%[data]]! \n\t"
75 : [sum0
] "w" (sum0
), [sum1
] "w" (sum1
)
76 : "memory", "cc", "q0" /* clobber list */
/* The bytes that do not fill a whole 16-byte group go to the fallback mixer. */
80 fallback(streams
, nstreams
, 2, data
, length
& mask
);
83 /* special case: mix 2 s16ne streams, 1 channel each
 *
 * Mixes the samples of streams[0] and streams[1] into data. length is given
 * in bytes and is converted to a sample count below. Groups of four samples
 * are processed with NEON, the remaining 0..3 samples with scalar code.
 * The streams' ptr members are only read; local copies are advanced.
 *
 * NOTE(review): lines are missing from this view (declarations of sv0 and
 * sv1, asm terminators, closing braces) -- confirm against the complete
 * file. */
84 static void pa_mix2_ch1_s16ne_neon(pa_mix_info streams
[], int16_t *data
, unsigned length
) {
85 const int16_t *ptr0
= streams
[0].ptr
;
86 const int16_t *ptr1
= streams
[1].ptr
;
/* Broadcast each stream's volume into all four lanes of sv0 / sv1. */
89 __asm__
__volatile__ (
90 "vdup.s32 %q[sv0], %[lin0] \n\t"
91 "vdup.s32 %q[sv1], %[lin1] \n\t"
92 : [sv0
] "=w" (sv0
), [sv1
] "=w" (sv1
)
93 : [lin0
] "r" (streams
[0].linear
[0]), [lin1
] "r" (streams
[1].linear
[0])
/* bytes -> number of int16_t samples */
97 length
/= sizeof(int16_t);
98 for (; length
>= 4; length
-= 4) {
/* Load four samples from each stream, widen them to 32 bit shifted left
 * by 15, scale with vqdmulh (giving (sample * volume) >> 16), add the two
 * streams with saturation, narrow to 16 bit with saturation and store.
 * All three pointers are post-incremented by the asm. */
99 __asm__
__volatile__ (
100 "vld1.s16 d0, [%[ptr0]]! \n\t"
101 "vld1.s16 d2, [%[ptr1]]! \n\t"
102 "vshll.s16 q0, d0, #15 \n\t"
103 "vshll.s16 q1, d2, #15 \n\t"
104 "vqdmulh.s32 q0, q0, %q[sv0] \n\t"
105 "vqdmulh.s32 q1, q1, %q[sv1] \n\t"
106 "vqadd.s32 q0, q0, q1 \n\t"
107 "vqmovn.s32 d0, q0 \n\t"
108 "vst1.s16 d0, [%[data]]! \n\t"
109 : [ptr0
] "+r" (ptr0
), [ptr1
] "+r" (ptr1
), [data
] "+r" (data
)
110 : [sv0
] "w" (sv0
), [sv1
] "w" (sv1
)
111 : "memory", "cc", "q0", "q1" /* clobber list */
/* Scalar tail: scale each remaining sample pair with pa_mult_s16_volume(),
 * add them and clamp the sum to the int16_t range. */
115 for (; length
> 0; length
--) {
116 int32_t sum
= pa_mult_s16_volume(*ptr0
++, streams
[0].linear
[0].i
);
117 sum
+= pa_mult_s16_volume(*ptr1
++, streams
[1].linear
[0].i
);
118 *data
++ = PA_CLAMP_UNLIKELY(sum
, -0x8000, 0x7FFF);
122 /* special case: mix 2 s16ne streams, 2 channels each
 *
 * Mixes the interleaved 2-channel samples of streams[0] and streams[1] into
 * data. length is given in bytes and is converted to a sample count below.
 * Groups of four samples (two frames) are processed with NEON; the visible
 * scalar code after the loop handles one further frame. The streams' ptr
 * members are only read; local copies are advanced.
 *
 * NOTE(review): lines are missing from this view (declarations of sv0, sv1
 * and sum, asm terminators, the condition guarding the scalar tail, closing
 * braces) -- confirm against the complete file. */
123 static void pa_mix2_ch2_s16ne_neon(pa_mix_info streams
[], int16_t *data
, unsigned length
) {
124 const int16_t *ptr0
= streams
[0].ptr
;
125 const int16_t *ptr1
= streams
[1].ptr
;
/* Build one volume vector per stream: load the first two 32-bit words at
 * streams[n].linear into d0, duplicate them into d1 and copy q0 to the
 * output register. The resulting lane order (vol0, vol1, vol0, vol1) lines
 * up with two interleaved 2-channel frames. */
128 __asm__
__volatile__ (
129 "vld1.s32 d0, [%[lin0]] \n\t"
130 "vmov.s32 d1, d0 \n\t"
131 "vmov.s32 %q[sv0], q0 \n\t"
132 "vld1.s32 d0, [%[lin1]] \n\t"
133 "vmov.s32 d1, d0 \n\t"
134 "vmov.s32 %q[sv1], q0 \n\t"
135 : [sv0
] "=w" (sv0
), [sv1
] "=w" (sv1
)
136 : [lin0
] "r" (streams
[0].linear
), [lin1
] "r" (streams
[1].linear
)
137 : "q0" /* clobber list */
/* bytes -> number of int16_t samples */
140 length
/= sizeof(int16_t);
141 for (; length
>= 4; length
-= 4) {
/* Load four samples from each stream, widen them to 32 bit shifted left
 * by 15, scale with vqdmulh (giving (sample * volume) >> 16), add the two
 * streams with saturation, narrow to 16 bit with saturation and store.
 * All three pointers are post-incremented by the asm. */
142 __asm__
__volatile__ (
143 "vld1.s16 d0, [%[ptr0]]! \n\t"
144 "vld1.s16 d2, [%[ptr1]]! \n\t"
145 "vshll.s16 q0, d0, #15 \n\t"
146 "vshll.s16 q1, d2, #15 \n\t"
147 "vqdmulh.s32 q0, q0, %q[sv0] \n\t"
148 "vqdmulh.s32 q1, q1, %q[sv1] \n\t"
149 "vqadd.s32 q0, q0, q1 \n\t"
150 "vqmovn.s32 d0, q0 \n\t"
151 "vst1.s16 d0, [%[data]]! \n\t"
152 : [ptr0
] "+r" (ptr0
), [ptr1
] "+r" (ptr1
), [data
] "+r" (data
)
153 : [sv0
] "w" (sv0
), [sv1
] "w" (sv1
)
154 : "memory", "cc", "q0", "q1" /* clobber list */
/* Scalar handling of one frame: first sample of the frame, scaled with
 * each stream's linear[0] volume, summed and clamped to the int16_t range.
 * NOTE(review): the statement guarding this tail is not visible here. */
161 sum
= pa_mult_s16_volume(*ptr0
++, streams
[0].linear
[0].i
);
162 sum
+= pa_mult_s16_volume(*ptr1
++, streams
[1].linear
[0].i
);
163 *data
++ = PA_CLAMP_UNLIKELY(sum
, -0x8000, 0x7FFF);
/* second sample of the frame, scaled with each stream's linear[1] volume */
165 sum
= pa_mult_s16_volume(*ptr0
++, streams
[0].linear
[1].i
);
166 sum
+= pa_mult_s16_volume(*ptr1
++, streams
[1].linear
[1].i
);
167 *data
++ = PA_CLAMP_UNLIKELY(sum
, -0x8000, 0x7FFF);
171 /* special case: mix 2 s16ne streams, 4 channels each
 *
 * Mixes the samples of streams[0] and streams[1] into data, four samples
 * per NEON iteration. length is given in bytes and is converted to a sample
 * count below. The streams' ptr members are only read; local copies are
 * advanced. No scalar tail is visible in this view.
 *
 * NOTE(review): lines are missing from this view (declarations of sv0 and
 * sv1, asm terminators, closing braces) -- confirm against the complete
 * file. */
172 static void pa_mix2_ch4_s16ne_neon(pa_mix_info streams
[], int16_t *data
, unsigned length
) {
173 const int16_t *ptr0
= streams
[0].ptr
;
174 const int16_t *ptr1
= streams
[1].ptr
;
/* Load the volumes found at streams[n].linear into sv0 / sv1.
 * NOTE(review): presumably %h prints the pair of D registers backing the
 * Q register, so four 32-bit volumes (one per channel) are loaded per
 * stream -- confirm against the GCC ARM operand modifier documentation. */
178 __asm__
__volatile__ (
179 "vld1.s32 %h[sv0], [%[lin0]] \n\t"
180 "vld1.s32 %h[sv1], [%[lin1]] \n\t"
181 : [sv0
] "=w" (sv0
), [sv1
] "=w" (sv1
)
182 : [lin0
] "r" (streams
[0].linear
), [lin1
] "r" (streams
[1].linear
)
/* bytes -> number of int16_t samples */
186 length
/= sizeof(int16_t);
187 for (; length
>= 4; length
-= 4) {
/* Load four samples from each stream, widen them to 32 bit shifted left
 * by 15, scale with vqdmulh (giving (sample * volume) >> 16), add the two
 * streams with saturation, narrow to 16 bit with saturation and store.
 * All three pointers are post-incremented by the asm. */
188 __asm__
__volatile__ (
189 "vld1.s16 d0, [%[ptr0]]! \n\t"
190 "vld1.s16 d2, [%[ptr1]]! \n\t"
191 "vshll.s16 q0, d0, #15 \n\t"
192 "vshll.s16 q1, d2, #15 \n\t"
193 "vqdmulh.s32 q0, q0, %q[sv0] \n\t"
194 "vqdmulh.s32 q1, q1, %q[sv1] \n\t"
195 "vqadd.s32 q0, q0, q1 \n\t"
196 "vqmovn.s32 d0, q0 \n\t"
197 "vst1.s16 d0, [%[data]]! \n\t"
198 : [ptr0
] "+r" (ptr0
), [ptr1
] "+r" (ptr1
), [data
] "+r" (data
)
199 : [sv0
] "w" (sv0
), [sv1
] "w" (sv1
)
200 : "memory", "cc", "q0", "q1" /* clobber list */
/* Entry point installed for PA_SAMPLE_S16NE by pa_mix_func_init_neon().
 * Picks a NEON kernel from the number of streams and channels:
 *   2 streams with 2, 4 or 1 channels -> the matching pa_mix2_chN kernel,
 *   any other stream count with 2 channels -> pa_mix_ch2_s16ne_neon().
 * The last call shown passes the request on to fallback() unchanged.
 *
 * NOTE(review): the line between the last "else if" branch and the
 * fallback() call is missing from this view; presumably it is a plain
 * "else", so that fallback() only runs when no kernel matched -- confirm
 * against the complete file. */
205 static void pa_mix_s16ne_neon(pa_mix_info streams
[], unsigned nstreams
, unsigned nchannels
, void *data
, unsigned length
) {
206 if (nstreams
== 2 && nchannels
== 2)
207 pa_mix2_ch2_s16ne_neon(streams
, data
, length
);
208 else if (nstreams
== 2 && nchannels
== 4)
209 pa_mix2_ch4_s16ne_neon(streams
, data
, length
);
210 else if (nstreams
== 2 && nchannels
== 1)
211 pa_mix2_ch1_s16ne_neon(streams
, data
, length
);
212 else if (nchannels
== 2)
213 pa_mix_ch2_s16ne_neon(streams
, nstreams
, data
, length
);
215 fallback(streams
, nstreams
, nchannels
, data
, length
);
/* Installs the NEON mixer for PA_SAMPLE_S16NE.
 * Logs a message, stores the mix function currently registered for
 * PA_SAMPLE_S16NE in fallback, then registers pa_mix_s16ne_neon in its
 * place. The order matters: fallback must be read before it is replaced.
 * The flags parameter is not referenced in the lines visible here. */
218 void pa_mix_func_init_neon(pa_cpu_arm_flag_t flags
) {
219 pa_log_info("Initialising ARM NEON optimized mixing functions.");
/* remember the previous implementation for the cases NEON does not cover */
221 fallback
= pa_get_mix_func(PA_SAMPLE_S16NE
);
222 pa_set_mix_func(PA_SAMPLE_S16NE
, (pa_do_mix_func_t
) pa_mix_s16ne_neon
);