Merge pull request #9 from bitbank2/master · adafruit/Adafruit_MP3@4024ab6 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4024ab6

Browse files
authored
Merge pull request #9 from bitbank2/master
added optimization for 64-bit shift and clip to 16-bits, nets 5-6% improvement
2 parents 8686130 + 1b1f774 commit 4024ab6

File tree

2 files changed

+51
-12
lines changed

2 files changed

+51
-12
lines changed

src/assembly.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,20 @@ static __inline Word64 SHL64(Word64 x, int n)
156156
}
157157
}
158158

159+
// Shift right 26 and return the lower 16-bits (short)
160+
static __inline short SAR64_Clip(Word64 x)
161+
{
162+
unsigned int xLo = ((unsigned int *)&x)[0];
163+
int xHi = ((int *)&x)[1];
164+
165+
__asm {
166+
mov edx, xHi
167+
mov eax, xLo
168+
mov cl, 26
169+
shrd eax, edx, cl
170+
}
171+
}
172+
159173
static __inline Word64 SAR64(Word64 x, int n)
160174
{
161175
unsigned int xLo = ((unsigned int *)&x)[0];
@@ -341,6 +355,18 @@ static __inline Word64 MADD64(Word64 sum64, int x, int y)
341355
return u.w64;
342356
}
343357

358+
// Shift the 64-bit value right by 26 bits and return the lower 16-bits
359+
__attribute__((__always_inline__)) static __inline short SAR64_Clip(Word64 x)
360+
{
361+
unsigned int xLo = (unsigned int) x;
362+
int xHi = (int) (x >> 32);
363+
__asm__ __volatile__(
364+
"lsr %1, %1, #26\n\t" // xLo <- xLo>>n
365+
"orr %1, %1, %0, lsl #6\n\t" // xLo <= xLo || (xHi << 6)
366+
: "+&r" (xHi), "+r" (xLo) );
367+
return( (short)xLo );
368+
}
369+
344370
__attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n)
345371
{
346372
unsigned int xLo = (unsigned int) x;

src/polyphase.c

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
142142
MC0M(6)
143143
MC0M(7)
144144

145-
*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
145+
// *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
146+
*(pcm + 0) = SAR64_Clip(sum1L);
146147

147148
/* special case, output sample 16 */
148149
coef = coefBase + 256;
@@ -158,7 +159,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
158159
MC1M(6)
159160
MC1M(7)
160161

161-
*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
162+
// *(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
163+
*(pcm + 16) = SAR64_Clip(sum1L);
162164

163165
/* main convolution loop: sum1L = samples 1, 2, 3, ... 15 sum2L = samples 31, 30, ... 17 */
164166
coef = coefBase + 16;
@@ -179,8 +181,10 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
179181
MC2M(7)
180182

181183
vb1 += 64;
182-
*(pcm) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
183-
*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
184+
// *(pcm) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
185+
// *(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
186+
*(pcm) = SAR64_Clip(sum1L);
187+
*(pcm + 2*i) = SAR64_Clip(sum2L);
184188
pcm++;
185189
}
186190
}
@@ -255,8 +259,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
255259
MC0S(6)
256260
MC0S(7)
257261

258-
*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
259-
*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
262+
// *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
263+
// *(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
264+
*(pcm + 0) = SAR64_Clip(sum1L);
265+
*(pcm + 1) = SAR64_Clip(sum1R);
260266

261267
/* special case, output sample 16 */
262268
coef = coefBase + 256;
@@ -272,8 +278,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
272278
MC1S(6)
273279
MC1S(7)
274280

275-
*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
276-
*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
281+
// *(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
282+
// *(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
283+
*(pcm + 2*16 + 0) = SAR64_Clip(sum1L);
284+
*(pcm + 2*16 + 1) = SAR64_Clip(sum1R);
277285

278286
/* main convolution loop: sum1L = samples 1, 2, 3, ... 15 sum2L = samples 31, 30, ... 17 */
279287
coef = coefBase + 16;
@@ -295,10 +303,15 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
295303
MC2S(7)
296304

297305
vb1 += 64;
298-
*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
299-
*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
300-
*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
301-
*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
306+
// *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
307+
// *(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
308+
*(pcm + 0) = SAR64_Clip(sum1L);
309+
*(pcm + 1) = SAR64_Clip(sum1R);
310+
// *(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
311+
// *(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
312+
*(pcm + 2*2*i + 0) = SAR64_Clip(sum2L);
313+
*(pcm + 2*2*i + 1) = SAR64_Clip(sum2R);
314+
302315
pcm += 2;
303316
}
304317
}

0 commit comments

Comments
 (0)
0