]> code.delx.au - pulseaudio/blob - src/modules/bluetooth/sbc/sbc_primitives_neon.c
5d4d0e338d5fb6448fb278358e267f042973b535
[pulseaudio] / src / modules / bluetooth / sbc / sbc_primitives_neon.c
1 /*
2 *
3 * Bluetooth low-complexity, subband codec (SBC) library
4 *
5 * Copyright (C) 2008-2010 Nokia Corporation
6 * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
7 * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
8 * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
9 *
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 *
25 */
26
27 #include <stdint.h>
28 #include <limits.h>
29 #include "sbc.h"
30 #include "sbc_math.h"
31 #include "sbc_tables.h"
32
33 #include "sbc_primitives_neon.h"
34
35 /*
36 * ARM NEON optimizations
37 */
38
39 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
40
41 static inline void _sbc_analyze_four_neon(const int16_t *in, int32_t *out,
42 const FIXED_T *consts)
43 {
44 /* TODO: merge even and odd cases (or even merge all four calls to this
45 * function) in order to have only aligned reads from 'in' array
46 * and reduce number of load instructions */
47 __asm__ volatile (
48 "vld1.16 {d4, d5}, [%0, :64]!\n"
49 "vld1.16 {d8, d9}, [%1, :128]!\n"
50
51 "vmull.s16 q0, d4, d8\n"
52 "vld1.16 {d6, d7}, [%0, :64]!\n"
53 "vmull.s16 q1, d5, d9\n"
54 "vld1.16 {d10, d11}, [%1, :128]!\n"
55
56 "vmlal.s16 q0, d6, d10\n"
57 "vld1.16 {d4, d5}, [%0, :64]!\n"
58 "vmlal.s16 q1, d7, d11\n"
59 "vld1.16 {d8, d9}, [%1, :128]!\n"
60
61 "vmlal.s16 q0, d4, d8\n"
62 "vld1.16 {d6, d7}, [%0, :64]!\n"
63 "vmlal.s16 q1, d5, d9\n"
64 "vld1.16 {d10, d11}, [%1, :128]!\n"
65
66 "vmlal.s16 q0, d6, d10\n"
67 "vld1.16 {d4, d5}, [%0, :64]!\n"
68 "vmlal.s16 q1, d7, d11\n"
69 "vld1.16 {d8, d9}, [%1, :128]!\n"
70
71 "vmlal.s16 q0, d4, d8\n"
72 "vmlal.s16 q1, d5, d9\n"
73
74 "vpadd.s32 d0, d0, d1\n"
75 "vpadd.s32 d1, d2, d3\n"
76
77 "vrshrn.s32 d0, q0, %3\n"
78
79 "vld1.16 {d2, d3, d4, d5}, [%1, :128]!\n"
80
81 "vdup.i32 d1, d0[1]\n" /* TODO: can be eliminated */
82 "vdup.i32 d0, d0[0]\n" /* TODO: can be eliminated */
83
84 "vmull.s16 q3, d2, d0\n"
85 "vmull.s16 q4, d3, d0\n"
86 "vmlal.s16 q3, d4, d1\n"
87 "vmlal.s16 q4, d5, d1\n"
88
89 "vpadd.s32 d0, d6, d7\n" /* TODO: can be eliminated */
90 "vpadd.s32 d1, d8, d9\n" /* TODO: can be eliminated */
91
92 "vst1.32 {d0, d1}, [%2, :128]\n"
93 : "+r" (in), "+r" (consts)
94 : "r" (out),
95 "i" (SBC_PROTO_FIXED4_SCALE)
96 : "memory",
97 "d0", "d1", "d2", "d3", "d4", "d5",
98 "d6", "d7", "d8", "d9", "d10", "d11");
99 }
100
101 static inline void _sbc_analyze_eight_neon(const int16_t *in, int32_t *out,
102 const FIXED_T *consts)
103 {
104 /* TODO: merge even and odd cases (or even merge all four calls to this
105 * function) in order to have only aligned reads from 'in' array
106 * and reduce number of load instructions */
107 __asm__ volatile (
108 "vld1.16 {d4, d5}, [%0, :64]!\n"
109 "vld1.16 {d8, d9}, [%1, :128]!\n"
110
111 "vmull.s16 q6, d4, d8\n"
112 "vld1.16 {d6, d7}, [%0, :64]!\n"
113 "vmull.s16 q7, d5, d9\n"
114 "vld1.16 {d10, d11}, [%1, :128]!\n"
115 "vmull.s16 q8, d6, d10\n"
116 "vld1.16 {d4, d5}, [%0, :64]!\n"
117 "vmull.s16 q9, d7, d11\n"
118 "vld1.16 {d8, d9}, [%1, :128]!\n"
119
120 "vmlal.s16 q6, d4, d8\n"
121 "vld1.16 {d6, d7}, [%0, :64]!\n"
122 "vmlal.s16 q7, d5, d9\n"
123 "vld1.16 {d10, d11}, [%1, :128]!\n"
124 "vmlal.s16 q8, d6, d10\n"
125 "vld1.16 {d4, d5}, [%0, :64]!\n"
126 "vmlal.s16 q9, d7, d11\n"
127 "vld1.16 {d8, d9}, [%1, :128]!\n"
128
129 "vmlal.s16 q6, d4, d8\n"
130 "vld1.16 {d6, d7}, [%0, :64]!\n"
131 "vmlal.s16 q7, d5, d9\n"
132 "vld1.16 {d10, d11}, [%1, :128]!\n"
133 "vmlal.s16 q8, d6, d10\n"
134 "vld1.16 {d4, d5}, [%0, :64]!\n"
135 "vmlal.s16 q9, d7, d11\n"
136 "vld1.16 {d8, d9}, [%1, :128]!\n"
137
138 "vmlal.s16 q6, d4, d8\n"
139 "vld1.16 {d6, d7}, [%0, :64]!\n"
140 "vmlal.s16 q7, d5, d9\n"
141 "vld1.16 {d10, d11}, [%1, :128]!\n"
142 "vmlal.s16 q8, d6, d10\n"
143 "vld1.16 {d4, d5}, [%0, :64]!\n"
144 "vmlal.s16 q9, d7, d11\n"
145 "vld1.16 {d8, d9}, [%1, :128]!\n"
146
147 "vmlal.s16 q6, d4, d8\n"
148 "vld1.16 {d6, d7}, [%0, :64]!\n"
149 "vmlal.s16 q7, d5, d9\n"
150 "vld1.16 {d10, d11}, [%1, :128]!\n"
151
152 "vmlal.s16 q8, d6, d10\n"
153 "vmlal.s16 q9, d7, d11\n"
154
155 "vpadd.s32 d0, d12, d13\n"
156 "vpadd.s32 d1, d14, d15\n"
157 "vpadd.s32 d2, d16, d17\n"
158 "vpadd.s32 d3, d18, d19\n"
159
160 "vrshr.s32 q0, q0, %3\n"
161 "vrshr.s32 q1, q1, %3\n"
162 "vmovn.s32 d0, q0\n"
163 "vmovn.s32 d1, q1\n"
164
165 "vdup.i32 d3, d1[1]\n" /* TODO: can be eliminated */
166 "vdup.i32 d2, d1[0]\n" /* TODO: can be eliminated */
167 "vdup.i32 d1, d0[1]\n" /* TODO: can be eliminated */
168 "vdup.i32 d0, d0[0]\n" /* TODO: can be eliminated */
169
170 "vld1.16 {d4, d5}, [%1, :128]!\n"
171 "vmull.s16 q6, d4, d0\n"
172 "vld1.16 {d6, d7}, [%1, :128]!\n"
173 "vmull.s16 q7, d5, d0\n"
174 "vmull.s16 q8, d6, d0\n"
175 "vmull.s16 q9, d7, d0\n"
176
177 "vld1.16 {d4, d5}, [%1, :128]!\n"
178 "vmlal.s16 q6, d4, d1\n"
179 "vld1.16 {d6, d7}, [%1, :128]!\n"
180 "vmlal.s16 q7, d5, d1\n"
181 "vmlal.s16 q8, d6, d1\n"
182 "vmlal.s16 q9, d7, d1\n"
183
184 "vld1.16 {d4, d5}, [%1, :128]!\n"
185 "vmlal.s16 q6, d4, d2\n"
186 "vld1.16 {d6, d7}, [%1, :128]!\n"
187 "vmlal.s16 q7, d5, d2\n"
188 "vmlal.s16 q8, d6, d2\n"
189 "vmlal.s16 q9, d7, d2\n"
190
191 "vld1.16 {d4, d5}, [%1, :128]!\n"
192 "vmlal.s16 q6, d4, d3\n"
193 "vld1.16 {d6, d7}, [%1, :128]!\n"
194 "vmlal.s16 q7, d5, d3\n"
195 "vmlal.s16 q8, d6, d3\n"
196 "vmlal.s16 q9, d7, d3\n"
197
198 "vpadd.s32 d0, d12, d13\n" /* TODO: can be eliminated */
199 "vpadd.s32 d1, d14, d15\n" /* TODO: can be eliminated */
200 "vpadd.s32 d2, d16, d17\n" /* TODO: can be eliminated */
201 "vpadd.s32 d3, d18, d19\n" /* TODO: can be eliminated */
202
203 "vst1.32 {d0, d1, d2, d3}, [%2, :128]\n"
204 : "+r" (in), "+r" (consts)
205 : "r" (out),
206 "i" (SBC_PROTO_FIXED8_SCALE)
207 : "memory",
208 "d0", "d1", "d2", "d3", "d4", "d5",
209 "d6", "d7", "d8", "d9", "d10", "d11",
210 "d12", "d13", "d14", "d15", "d16", "d17",
211 "d18", "d19");
212 }
213
214 static inline void sbc_analyze_4b_4s_neon(int16_t *x,
215 int32_t *out, int out_stride)
216 {
217 /* Analyze blocks */
218 _sbc_analyze_four_neon(x + 12, out, analysis_consts_fixed4_simd_odd);
219 out += out_stride;
220 _sbc_analyze_four_neon(x + 8, out, analysis_consts_fixed4_simd_even);
221 out += out_stride;
222 _sbc_analyze_four_neon(x + 4, out, analysis_consts_fixed4_simd_odd);
223 out += out_stride;
224 _sbc_analyze_four_neon(x + 0, out, analysis_consts_fixed4_simd_even);
225 }
226
227 static inline void sbc_analyze_4b_8s_neon(int16_t *x,
228 int32_t *out, int out_stride)
229 {
230 /* Analyze blocks */
231 _sbc_analyze_eight_neon(x + 24, out, analysis_consts_fixed8_simd_odd);
232 out += out_stride;
233 _sbc_analyze_eight_neon(x + 16, out, analysis_consts_fixed8_simd_even);
234 out += out_stride;
235 _sbc_analyze_eight_neon(x + 8, out, analysis_consts_fixed8_simd_odd);
236 out += out_stride;
237 _sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
238 }
239
240 static void sbc_calc_scalefactors_neon(
241 int32_t sb_sample_f[16][2][8],
242 uint32_t scale_factor[2][8],
243 int blocks, int channels, int subbands)
244 {
245 int ch, sb;
246 for (ch = 0; ch < channels; ch++) {
247 for (sb = 0; sb < subbands; sb += 4) {
248 int blk = blocks;
249 int32_t *in = &sb_sample_f[0][ch][sb];
250 __asm__ volatile (
251 "vmov.s32 q0, #0\n"
252 "vmov.s32 q1, %[c1]\n"
253 "vmov.s32 q14, #1\n"
254 "vmov.s32 q15, %[c2]\n"
255 "vadd.s32 q1, q1, q14\n"
256 "1:\n"
257 "vld1.32 {d16, d17}, [%[in], :128], %[inc]\n"
258 "vabs.s32 q8, q8\n"
259 "vld1.32 {d18, d19}, [%[in], :128], %[inc]\n"
260 "vabs.s32 q9, q9\n"
261 "vld1.32 {d20, d21}, [%[in], :128], %[inc]\n"
262 "vabs.s32 q10, q10\n"
263 "vld1.32 {d22, d23}, [%[in], :128], %[inc]\n"
264 "vabs.s32 q11, q11\n"
265 "vmax.s32 q0, q0, q8\n"
266 "vmax.s32 q1, q1, q9\n"
267 "vmax.s32 q0, q0, q10\n"
268 "vmax.s32 q1, q1, q11\n"
269 "subs %[blk], %[blk], #4\n"
270 "bgt 1b\n"
271 "vmax.s32 q0, q0, q1\n"
272 "vsub.s32 q0, q0, q14\n"
273 "vclz.s32 q0, q0\n"
274 "vsub.s32 q0, q15, q0\n"
275 "vst1.32 {d0, d1}, [%[out], :128]\n"
276 :
277 [blk] "+r" (blk),
278 [in] "+r" (in)
279 :
280 [inc] "r" ((char *) &sb_sample_f[1][0][0] -
281 (char *) &sb_sample_f[0][0][0]),
282 [out] "r" (&scale_factor[ch][sb]),
283 [c1] "i" (1 << SCALE_OUT_BITS),
284 [c2] "i" (31 - SCALE_OUT_BITS)
285 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19",
286 "d20", "d21", "d22", "d23", "d24", "d25", "d26",
287 "d27", "d28", "d29", "d30", "d31", "cc", "memory");
288 }
289 }
290 }
291
292 int sbc_calc_scalefactors_j_neon(
293 int32_t sb_sample_f[16][2][8],
294 uint32_t scale_factor[2][8],
295 int blocks, int subbands)
296 {
297 static SBC_ALIGNED int32_t joint_bits_mask[8] = {
298 8, 4, 2, 1, 128, 64, 32, 16
299 };
300 int joint, i;
301 int32_t *in0, *in1;
302 int32_t *in = &sb_sample_f[0][0][0];
303 uint32_t *out0, *out1;
304 uint32_t *out = &scale_factor[0][0];
305 int32_t *consts = joint_bits_mask;
306
307 i = subbands;
308
309 __asm__ volatile (
310 /*
311 * constants: q13 = (31 - SCALE_OUT_BITS), q14 = 1
312 * input: q0 = ((1 << SCALE_OUT_BITS) + 1)
313 * %[in0] - samples for channel 0
314 * %[in1] - samples for shannel 1
315 * output: q0, q1 - scale factors without joint stereo
316 * q2, q3 - scale factors with joint stereo
317 * q15 - joint stereo selection mask
318 */
319 ".macro calc_scalefactors\n"
320 "vmov.s32 q1, q0\n"
321 "vmov.s32 q2, q0\n"
322 "vmov.s32 q3, q0\n"
323 "mov %[i], %[blocks]\n"
324 "1:\n"
325 "vld1.32 {d18, d19}, [%[in1], :128], %[inc]\n"
326 "vbic.s32 q11, q9, q14\n"
327 "vld1.32 {d16, d17}, [%[in0], :128], %[inc]\n"
328 "vhadd.s32 q10, q8, q11\n"
329 "vhsub.s32 q11, q8, q11\n"
330 "vabs.s32 q8, q8\n"
331 "vabs.s32 q9, q9\n"
332 "vabs.s32 q10, q10\n"
333 "vabs.s32 q11, q11\n"
334 "vmax.s32 q0, q0, q8\n"
335 "vmax.s32 q1, q1, q9\n"
336 "vmax.s32 q2, q2, q10\n"
337 "vmax.s32 q3, q3, q11\n"
338 "subs %[i], %[i], #1\n"
339 "bgt 1b\n"
340 "vsub.s32 q0, q0, q14\n"
341 "vsub.s32 q1, q1, q14\n"
342 "vsub.s32 q2, q2, q14\n"
343 "vsub.s32 q3, q3, q14\n"
344 "vclz.s32 q0, q0\n"
345 "vclz.s32 q1, q1\n"
346 "vclz.s32 q2, q2\n"
347 "vclz.s32 q3, q3\n"
348 "vsub.s32 q0, q13, q0\n"
349 "vsub.s32 q1, q13, q1\n"
350 "vsub.s32 q2, q13, q2\n"
351 "vsub.s32 q3, q13, q3\n"
352 ".endm\n"
353 /*
354 * constants: q14 = 1
355 * input: q15 - joint stereo selection mask
356 * %[in0] - value set by calc_scalefactors macro
357 * %[in1] - value set by calc_scalefactors macro
358 */
359 ".macro update_joint_stereo_samples\n"
360 "sub %[out1], %[in1], %[inc]\n"
361 "sub %[out0], %[in0], %[inc]\n"
362 "sub %[in1], %[in1], %[inc], asl #1\n"
363 "sub %[in0], %[in0], %[inc], asl #1\n"
364 "vld1.32 {d18, d19}, [%[in1], :128]\n"
365 "vbic.s32 q11, q9, q14\n"
366 "vld1.32 {d16, d17}, [%[in0], :128]\n"
367 "vld1.32 {d2, d3}, [%[out1], :128]\n"
368 "vbic.s32 q3, q1, q14\n"
369 "vld1.32 {d0, d1}, [%[out0], :128]\n"
370 "vhsub.s32 q10, q8, q11\n"
371 "vhadd.s32 q11, q8, q11\n"
372 "vhsub.s32 q2, q0, q3\n"
373 "vhadd.s32 q3, q0, q3\n"
374 "vbif.s32 q10, q9, q15\n"
375 "vbif.s32 d22, d16, d30\n"
376 "sub %[inc], %[zero], %[inc], asl #1\n"
377 "sub %[i], %[blocks], #2\n"
378 "2:\n"
379 "vbif.s32 d23, d17, d31\n"
380 "vst1.32 {d20, d21}, [%[in1], :128], %[inc]\n"
381 "vbif.s32 d4, d2, d30\n"
382 "vld1.32 {d18, d19}, [%[in1], :128]\n"
383 "vbif.s32 d5, d3, d31\n"
384 "vst1.32 {d22, d23}, [%[in0], :128], %[inc]\n"
385 "vbif.s32 d6, d0, d30\n"
386 "vld1.32 {d16, d17}, [%[in0], :128]\n"
387 "vbif.s32 d7, d1, d31\n"
388 "vst1.32 {d4, d5}, [%[out1], :128], %[inc]\n"
389 "vbic.s32 q11, q9, q14\n"
390 "vld1.32 {d2, d3}, [%[out1], :128]\n"
391 "vst1.32 {d6, d7}, [%[out0], :128], %[inc]\n"
392 "vbic.s32 q3, q1, q14\n"
393 "vld1.32 {d0, d1}, [%[out0], :128]\n"
394 "vhsub.s32 q10, q8, q11\n"
395 "vhadd.s32 q11, q8, q11\n"
396 "vhsub.s32 q2, q0, q3\n"
397 "vhadd.s32 q3, q0, q3\n"
398 "vbif.s32 q10, q9, q15\n"
399 "vbif.s32 d22, d16, d30\n"
400 "subs %[i], %[i], #2\n"
401 "bgt 2b\n"
402 "sub %[inc], %[zero], %[inc], asr #1\n"
403 "vbif.s32 d23, d17, d31\n"
404 "vst1.32 {d20, d21}, [%[in1], :128]\n"
405 "vbif.s32 q2, q1, q15\n"
406 "vst1.32 {d22, d23}, [%[in0], :128]\n"
407 "vbif.s32 q3, q0, q15\n"
408 "vst1.32 {d4, d5}, [%[out1], :128]\n"
409 "vst1.32 {d6, d7}, [%[out0], :128]\n"
410 ".endm\n"
411
412 "vmov.s32 q14, #1\n"
413 "vmov.s32 q13, %[c2]\n"
414
415 "cmp %[i], #4\n"
416 "bne 8f\n"
417
418 "4:\n" /* 4 subbands */
419 "add %[in0], %[in], #0\n"
420 "add %[in1], %[in], #32\n"
421 "add %[out0], %[out], #0\n"
422 "add %[out1], %[out], #32\n"
423 "vmov.s32 q0, %[c1]\n"
424 "vadd.s32 q0, q0, q14\n"
425
426 "calc_scalefactors\n"
427
428 /* check whether to use joint stereo for subbands 0, 1, 2 */
429 "vadd.s32 q15, q0, q1\n"
430 "vadd.s32 q9, q2, q3\n"
431 "vmov.s32 d31[1], %[zero]\n" /* last subband -> no joint */
432 "vld1.32 {d16, d17}, [%[consts], :128]!\n"
433 "vcgt.s32 q15, q15, q9\n"
434
435 /* calculate and save to memory 'joint' variable */
436 /* update and save scale factors to memory */
437 " vand.s32 q8, q8, q15\n"
438 "vbit.s32 q0, q2, q15\n"
439 " vpadd.s32 d16, d16, d17\n"
440 "vbit.s32 q1, q3, q15\n"
441 " vpadd.s32 d16, d16, d16\n"
442 "vst1.32 {d0, d1}, [%[out0], :128]\n"
443 "vst1.32 {d2, d3}, [%[out1], :128]\n"
444 " vst1.32 {d16[0]}, [%[joint]]\n"
445
446 "update_joint_stereo_samples\n"
447 "b 9f\n"
448
449 "8:\n" /* 8 subbands */
450 "add %[in0], %[in], #16\n\n"
451 "add %[in1], %[in], #48\n"
452 "add %[out0], %[out], #16\n\n"
453 "add %[out1], %[out], #48\n"
454 "vmov.s32 q0, %[c1]\n"
455 "vadd.s32 q0, q0, q14\n"
456
457 "calc_scalefactors\n"
458
459 /* check whether to use joint stereo for subbands 4, 5, 6 */
460 "vadd.s32 q15, q0, q1\n"
461 "vadd.s32 q9, q2, q3\n"
462 "vmov.s32 d31[1], %[zero]\n" /* last subband -> no joint */
463 "vld1.32 {d16, d17}, [%[consts], :128]!\n"
464 "vcgt.s32 q15, q15, q9\n"
465
466 /* calculate part of 'joint' variable and save it to d24 */
467 /* update and save scale factors to memory */
468 " vand.s32 q8, q8, q15\n"
469 "vbit.s32 q0, q2, q15\n"
470 " vpadd.s32 d16, d16, d17\n"
471 "vbit.s32 q1, q3, q15\n"
472 "vst1.32 {d0, d1}, [%[out0], :128]\n"
473 "vst1.32 {d2, d3}, [%[out1], :128]\n"
474 " vpadd.s32 d24, d16, d16\n"
475
476 "update_joint_stereo_samples\n"
477
478 "add %[in0], %[in], #0\n"
479 "add %[in1], %[in], #32\n"
480 "add %[out0], %[out], #0\n\n"
481 "add %[out1], %[out], #32\n"
482 "vmov.s32 q0, %[c1]\n"
483 "vadd.s32 q0, q0, q14\n"
484
485 "calc_scalefactors\n"
486
487 /* check whether to use joint stereo for subbands 0, 1, 2, 3 */
488 "vadd.s32 q15, q0, q1\n"
489 "vadd.s32 q9, q2, q3\n"
490 "vld1.32 {d16, d17}, [%[consts], :128]!\n"
491 "vcgt.s32 q15, q15, q9\n"
492
493 /* combine last part of 'joint' with d24 and save to memory */
494 /* update and save scale factors to memory */
495 " vand.s32 q8, q8, q15\n"
496 "vbit.s32 q0, q2, q15\n"
497 " vpadd.s32 d16, d16, d17\n"
498 "vbit.s32 q1, q3, q15\n"
499 " vpadd.s32 d16, d16, d16\n"
500 "vst1.32 {d0, d1}, [%[out0], :128]\n"
501 " vadd.s32 d16, d16, d24\n"
502 "vst1.32 {d2, d3}, [%[out1], :128]\n"
503 " vst1.32 {d16[0]}, [%[joint]]\n"
504
505 "update_joint_stereo_samples\n"
506 "9:\n"
507 ".purgem calc_scalefactors\n"
508 ".purgem update_joint_stereo_samples\n"
509 :
510 [i] "+&r" (i),
511 [in] "+&r" (in),
512 [in0] "=&r" (in0),
513 [in1] "=&r" (in1),
514 [out] "+&r" (out),
515 [out0] "=&r" (out0),
516 [out1] "=&r" (out1),
517 [consts] "+&r" (consts)
518 :
519 [inc] "r" ((char *) &sb_sample_f[1][0][0] -
520 (char *) &sb_sample_f[0][0][0]),
521 [blocks] "r" (blocks),
522 [joint] "r" (&joint),
523 [c1] "i" (1 << SCALE_OUT_BITS),
524 [c2] "i" (31 - SCALE_OUT_BITS),
525 [zero] "r" (0)
526 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
527 "d16", "d17", "d18", "d19", "d20", "d21", "d22",
528 "d23", "d24", "d25", "d26", "d27", "d28", "d29",
529 "d30", "d31", "cc", "memory");
530
531 return joint;
532 }
533
/*
 * Byte-index initializers for 8-entry permutation tables.
 *
 * Each argument is a 16-bit sample index; it expands to the indices of
 * the two bytes of that sample. PERM_BE lists the odd byte first (which
 * swaps the two bytes), PERM_LE lists the even byte first.
 *
 * Every parameter is parenthesized so that an expression argument such
 * as PERM_BE(n + 1, ...) expands to ((n + 1) * 2) and not (n + 1 * 2).
 */
#define PERM_BE(a, b, c, d) {             \
		((a) * 2) + 1, ((a) * 2) + 0, \
		((b) * 2) + 1, ((b) * 2) + 0, \
		((c) * 2) + 1, ((c) * 2) + 0, \
		((d) * 2) + 1, ((d) * 2) + 0  \
	}
#define PERM_LE(a, b, c, d) {             \
		((a) * 2) + 0, ((a) * 2) + 1, \
		((b) * 2) + 0, ((b) * 2) + 1, \
		((c) * 2) + 0, ((c) * 2) + 1, \
		((d) * 2) + 0, ((d) * 2) + 1  \
	}
546
547 static SBC_ALWAYS_INLINE int sbc_enc_process_input_4s_neon_internal(
548 int position,
549 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
550 int nsamples, int nchannels, int big_endian)
551 {
552 static SBC_ALIGNED uint8_t perm_be[2][8] = {
553 PERM_BE(7, 3, 6, 4),
554 PERM_BE(0, 2, 1, 5)
555 };
556 static SBC_ALIGNED uint8_t perm_le[2][8] = {
557 PERM_LE(7, 3, 6, 4),
558 PERM_LE(0, 2, 1, 5)
559 };
560 /* handle X buffer wraparound */
561 if (position < nsamples) {
562 int16_t *dst = &X[0][SBC_X_BUFFER_SIZE - 40];
563 int16_t *src = &X[0][position];
564 __asm__ volatile (
565 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
566 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
567 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
568 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
569 "vld1.16 {d0}, [%[src], :64]!\n"
570 "vst1.16 {d0}, [%[dst], :64]!\n"
571 :
572 [dst] "+r" (dst),
573 [src] "+r" (src)
574 : : "memory", "d0", "d1", "d2", "d3");
575 if (nchannels > 1) {
576 dst = &X[1][SBC_X_BUFFER_SIZE - 40];
577 src = &X[1][position];
578 __asm__ volatile (
579 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
580 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
581 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
582 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
583 "vld1.16 {d0}, [%[src], :64]!\n"
584 "vst1.16 {d0}, [%[dst], :64]!\n"
585 :
586 [dst] "+r" (dst),
587 [src] "+r" (src)
588 : : "memory", "d0", "d1", "d2", "d3");
589 }
590 position = SBC_X_BUFFER_SIZE - 40;
591 }
592
593 if ((nchannels > 1) && ((uintptr_t)pcm & 1)) {
594 /* poor 'pcm' alignment */
595 int16_t *x = &X[0][position];
596 int16_t *y = &X[1][position];
597 __asm__ volatile (
598 "vld1.8 {d0, d1}, [%[perm], :128]\n"
599 "1:\n"
600 "sub %[x], %[x], #16\n"
601 "sub %[y], %[y], #16\n"
602 "sub %[position], %[position], #8\n"
603 "vld1.8 {d4, d5}, [%[pcm]]!\n"
604 "vuzp.16 d4, d5\n"
605 "vld1.8 {d20, d21}, [%[pcm]]!\n"
606 "vuzp.16 d20, d21\n"
607 "vswp d5, d20\n"
608 "vtbl.8 d16, {d4, d5}, d0\n"
609 "vtbl.8 d17, {d4, d5}, d1\n"
610 "vtbl.8 d18, {d20, d21}, d0\n"
611 "vtbl.8 d19, {d20, d21}, d1\n"
612 "vst1.16 {d16, d17}, [%[x], :128]\n"
613 "vst1.16 {d18, d19}, [%[y], :128]\n"
614 "subs %[nsamples], %[nsamples], #8\n"
615 "bgt 1b\n"
616 :
617 [x] "+r" (x),
618 [y] "+r" (y),
619 [pcm] "+r" (pcm),
620 [nsamples] "+r" (nsamples),
621 [position] "+r" (position)
622 :
623 [perm] "r" (big_endian ? perm_be : perm_le)
624 : "cc", "memory", "d0", "d1", "d2", "d3", "d4",
625 "d5", "d6", "d7", "d16", "d17", "d18", "d19",
626 "d20", "d21", "d22", "d23");
627 } else if (nchannels > 1) {
628 /* proper 'pcm' alignment */
629 int16_t *x = &X[0][position];
630 int16_t *y = &X[1][position];
631 __asm__ volatile (
632 "vld1.8 {d0, d1}, [%[perm], :128]\n"
633 "1:\n"
634 "sub %[x], %[x], #16\n"
635 "sub %[y], %[y], #16\n"
636 "sub %[position], %[position], #8\n"
637 "vld2.16 {d4, d5}, [%[pcm]]!\n"
638 "vld2.16 {d20, d21}, [%[pcm]]!\n"
639 "vswp d5, d20\n"
640 "vtbl.8 d16, {d4, d5}, d0\n"
641 "vtbl.8 d17, {d4, d5}, d1\n"
642 "vtbl.8 d18, {d20, d21}, d0\n"
643 "vtbl.8 d19, {d20, d21}, d1\n"
644 "vst1.16 {d16, d17}, [%[x], :128]\n"
645 "vst1.16 {d18, d19}, [%[y], :128]\n"
646 "subs %[nsamples], %[nsamples], #8\n"
647 "bgt 1b\n"
648 :
649 [x] "+r" (x),
650 [y] "+r" (y),
651 [pcm] "+r" (pcm),
652 [nsamples] "+r" (nsamples),
653 [position] "+r" (position)
654 :
655 [perm] "r" (big_endian ? perm_be : perm_le)
656 : "cc", "memory", "d0", "d1", "d2", "d3", "d4",
657 "d5", "d6", "d7", "d16", "d17", "d18", "d19",
658 "d20", "d21", "d22", "d23");
659 } else {
660 int16_t *x = &X[0][position];
661 __asm__ volatile (
662 "vld1.8 {d0, d1}, [%[perm], :128]\n"
663 "1:\n"
664 "sub %[x], %[x], #16\n"
665 "sub %[position], %[position], #8\n"
666 "vld1.8 {d4, d5}, [%[pcm]]!\n"
667 "vtbl.8 d16, {d4, d5}, d0\n"
668 "vtbl.8 d17, {d4, d5}, d1\n"
669 "vst1.16 {d16, d17}, [%[x], :128]\n"
670 "subs %[nsamples], %[nsamples], #8\n"
671 "bgt 1b\n"
672 :
673 [x] "+r" (x),
674 [pcm] "+r" (pcm),
675 [nsamples] "+r" (nsamples),
676 [position] "+r" (position)
677 :
678 [perm] "r" (big_endian ? perm_be : perm_le)
679 : "cc", "memory", "d0", "d1", "d2", "d3", "d4",
680 "d5", "d6", "d7", "d16", "d17", "d18", "d19");
681 }
682 return position;
683 }
684
685 static SBC_ALWAYS_INLINE int sbc_enc_process_input_8s_neon_internal(
686 int position,
687 const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
688 int nsamples, int nchannels, int big_endian)
689 {
690 static SBC_ALIGNED uint8_t perm_be[4][8] = {
691 PERM_BE(15, 7, 14, 8),
692 PERM_BE(13, 9, 12, 10),
693 PERM_BE(11, 3, 6, 0),
694 PERM_BE(5, 1, 4, 2)
695 };
696 static SBC_ALIGNED uint8_t perm_le[4][8] = {
697 PERM_LE(15, 7, 14, 8),
698 PERM_LE(13, 9, 12, 10),
699 PERM_LE(11, 3, 6, 0),
700 PERM_LE(5, 1, 4, 2)
701 };
702 /* handle X buffer wraparound */
703 if (position < nsamples) {
704 int16_t *dst = &X[0][SBC_X_BUFFER_SIZE - 72];
705 int16_t *src = &X[0][position];
706 __asm__ volatile (
707 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
708 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
709 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
710 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
711 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
712 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
713 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
714 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
715 "vld1.16 {d0, d1}, [%[src], :128]!\n"
716 "vst1.16 {d0, d1}, [%[dst], :128]!\n"
717 :
718 [dst] "+r" (dst),
719 [src] "+r" (src)
720 : : "memory", "d0", "d1", "d2", "d3");
721 if (nchannels > 1) {
722 dst = &X[1][SBC_X_BUFFER_SIZE - 72];
723 src = &X[1][position];
724 __asm__ volatile (
725 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
726 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
727 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
728 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
729 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
730 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
731 "vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
732 "vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
733 "vld1.16 {d0, d1}, [%[src], :128]!\n"
734 "vst1.16 {d0, d1}, [%[dst], :128]!\n"
735 :
736 [dst] "+r" (dst),
737 [src] "+r" (src)
738 : : "memory", "d0", "d1", "d2", "d3");
739 }
740 position = SBC_X_BUFFER_SIZE - 72;
741 }
742
743 if ((nchannels > 1) && ((uintptr_t)pcm & 1)) {
744 /* poor 'pcm' alignment */
745 int16_t *x = &X[0][position];
746 int16_t *y = &X[1][position];
747 __asm__ volatile (
748 "vld1.8 {d0, d1, d2, d3}, [%[perm], :128]\n"
749 "1:\n"
750 "sub %[x], %[x], #32\n"
751 "sub %[y], %[y], #32\n"
752 "sub %[position], %[position], #16\n"
753 "vld1.8 {d4, d5, d6, d7}, [%[pcm]]!\n"
754 "vuzp.16 q2, q3\n"
755 "vld1.8 {d20, d21, d22, d23}, [%[pcm]]!\n"
756 "vuzp.16 q10, q11\n"
757 "vswp q3, q10\n"
758 "vtbl.8 d16, {d4, d5, d6, d7}, d0\n"
759 "vtbl.8 d17, {d4, d5, d6, d7}, d1\n"
760 "vtbl.8 d18, {d4, d5, d6, d7}, d2\n"
761 "vtbl.8 d19, {d4, d5, d6, d7}, d3\n"
762 "vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
763 "vtbl.8 d16, {d20, d21, d22, d23}, d0\n"
764 "vtbl.8 d17, {d20, d21, d22, d23}, d1\n"
765 "vtbl.8 d18, {d20, d21, d22, d23}, d2\n"
766 "vtbl.8 d19, {d20, d21, d22, d23}, d3\n"
767 "vst1.16 {d16, d17, d18, d19}, [%[y], :128]\n"
768 "subs %[nsamples], %[nsamples], #16\n"
769 "bgt 1b\n"
770 :
771 [x] "+r" (x),
772 [y] "+r" (y),
773 [pcm] "+r" (pcm),
774 [nsamples] "+r" (nsamples),
775 [position] "+r" (position)
776 :
777 [perm] "r" (big_endian ? perm_be : perm_le)
778 : "cc", "memory", "d0", "d1", "d2", "d3", "d4",
779 "d5", "d6", "d7", "d16", "d17", "d18", "d19",
780 "d20", "d21", "d22", "d23");
781 } else if (nchannels > 1) {
782 /* proper 'pcm' alignment */
783 int16_t *x = &X[0][position];
784 int16_t *y = &X[1][position];
785 __asm__ volatile (
786 "vld1.8 {d0, d1, d2, d3}, [%[perm], :128]\n"
787 "1:\n"
788 "sub %[x], %[x], #32\n"
789 "sub %[y], %[y], #32\n"
790 "sub %[position], %[position], #16\n"
791 "vld2.16 {d4, d5, d6, d7}, [%[pcm]]!\n"
792 "vld2.16 {d20, d21, d22, d23}, [%[pcm]]!\n"
793 "vswp q3, q10\n"
794 "vtbl.8 d16, {d4, d5, d6, d7}, d0\n"
795 "vtbl.8 d17, {d4, d5, d6, d7}, d1\n"
796 "vtbl.8 d18, {d4, d5, d6, d7}, d2\n"
797 "vtbl.8 d19, {d4, d5, d6, d7}, d3\n"
798 "vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
799 "vtbl.8 d16, {d20, d21, d22, d23}, d0\n"
800 "vtbl.8 d17, {d20, d21, d22, d23}, d1\n"
801 "vtbl.8 d18, {d20, d21, d22, d23}, d2\n"
802 "vtbl.8 d19, {d20, d21, d22, d23}, d3\n"
803 "vst1.16 {d16, d17, d18, d19}, [%[y], :128]\n"
804 "subs %[nsamples], %[nsamples], #16\n"
805 "bgt 1b\n"
806 :
807 [x] "+r" (x),
808 [y] "+r" (y),
809 [pcm] "+r" (pcm),
810 [nsamples] "+r" (nsamples),
811 [position] "+r" (position)
812 :
813 [perm] "r" (big_endian ? perm_be : perm_le)
814 : "cc", "memory", "d0", "d1", "d2", "d3", "d4",
815 "d5", "d6", "d7", "d16", "d17", "d18", "d19",
816 "d20", "d21", "d22", "d23");
817 } else {
818 int16_t *x = &X[0][position];
819 __asm__ volatile (
820 "vld1.8 {d0, d1, d2, d3}, [%[perm], :128]\n"
821 "1:\n"
822 "sub %[x], %[x], #32\n"
823 "sub %[position], %[position], #16\n"
824 "vld1.8 {d4, d5, d6, d7}, [%[pcm]]!\n"
825 "vtbl.8 d16, {d4, d5, d6, d7}, d0\n"
826 "vtbl.8 d17, {d4, d5, d6, d7}, d1\n"
827 "vtbl.8 d18, {d4, d5, d6, d7}, d2\n"
828 "vtbl.8 d19, {d4, d5, d6, d7}, d3\n"
829 "vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
830 "subs %[nsamples], %[nsamples], #16\n"
831 "bgt 1b\n"
832 :
833 [x] "+r" (x),
834 [pcm] "+r" (pcm),
835 [nsamples] "+r" (nsamples),
836 [position] "+r" (position)
837 :
838 [perm] "r" (big_endian ? perm_be : perm_le)
839 : "cc", "memory", "d0", "d1", "d2", "d3", "d4",
840 "d5", "d6", "d7", "d16", "d17", "d18", "d19");
841 }
842 return position;
843 }
844
845 #undef PERM_BE
846 #undef PERM_LE
847
848 static int sbc_enc_process_input_4s_be_neon(int position, const uint8_t *pcm,
849 int16_t X[2][SBC_X_BUFFER_SIZE],
850 int nsamples, int nchannels)
851 {
852 return sbc_enc_process_input_4s_neon_internal(
853 position, pcm, X, nsamples, nchannels, 1);
854 }
855
856 static int sbc_enc_process_input_4s_le_neon(int position, const uint8_t *pcm,
857 int16_t X[2][SBC_X_BUFFER_SIZE],
858 int nsamples, int nchannels)
859 {
860 return sbc_enc_process_input_4s_neon_internal(
861 position, pcm, X, nsamples, nchannels, 0);
862 }
863
864 static int sbc_enc_process_input_8s_be_neon(int position, const uint8_t *pcm,
865 int16_t X[2][SBC_X_BUFFER_SIZE],
866 int nsamples, int nchannels)
867 {
868 return sbc_enc_process_input_8s_neon_internal(
869 position, pcm, X, nsamples, nchannels, 1);
870 }
871
872 static int sbc_enc_process_input_8s_le_neon(int position, const uint8_t *pcm,
873 int16_t X[2][SBC_X_BUFFER_SIZE],
874 int nsamples, int nchannels)
875 {
876 return sbc_enc_process_input_8s_neon_internal(
877 position, pcm, X, nsamples, nchannels, 0);
878 }
879
880 void sbc_init_primitives_neon(struct sbc_encoder_state *state)
881 {
882 state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
883 state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
884 state->sbc_calc_scalefactors = sbc_calc_scalefactors_neon;
885 state->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j_neon;
886 state->sbc_enc_process_input_4s_le = sbc_enc_process_input_4s_le_neon;
887 state->sbc_enc_process_input_4s_be = sbc_enc_process_input_4s_be_neon;
888 state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le_neon;
889 state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be_neon;
890 state->implementation_info = "NEON";
891 }
892
893 #endif