adafruit
diff --git a/‎src/assembly.h
Lines changed: 26 additions & 0 deletions b/‎src/assembly.h
Lines changed: 26 additions & 0 deletions
diff --git a/‎src/polyphase.c
Lines changed: 25 additions & 12 deletions b/‎src/polyphase.c
Lines changed: 25 additions & 12 deletions
@@ -156,6 +156,20 @@ static __inline Word64 SHL64(Word64 x, int n)
 	}
 }
 
+// Shift right 26 and saturate the result to the signed 16-bit range (short)
+static __inline short SAR64_Clip(Word64 x)
+{
+    unsigned int xLo = ((unsigned int *)&x)[0];
+    int xHi = ((int *)&x)[1];
+    int v, sign;
+    /* low 32 bits of (x >> 26); done in C so the value is returned explicitly, not via EAX */
+    v = (int)((xLo >> 26) | ((unsigned int)xHi << 6));
+    sign = v >> 31;                 /* 0 if v >= 0, -1 if v < 0 */
+    if (sign != (v >> 15))          /* v does not fit in 16 signed bits */
+        v = sign ^ ((1 << 15) - 1); /* clamp: 32767 when positive, -32768 when negative */
+    return (short)v;
+}
+
 static __inline Word64 SAR64(Word64 x, int n)
 {
 	unsigned int xLo = ((unsigned int *)&x)[0];
@@ -341,6 +355,18 @@ static __inline Word64 MADD64(Word64 sum64, int x, int y)
         return u.w64;
 }
 
+// Shift the 64-bit value right by 26 bits and saturate the result to the signed 16-bit range
+__attribute__((__always_inline__)) static __inline short SAR64_Clip(Word64 x)
+{
+  unsigned int xLo = (unsigned int) x;
+  int xHi = (int) (x >> 32);
+  int sign;
+  xHi = (int) ((xLo >> 26) | ((unsigned int) xHi << 6));  // low 32 bits of (x >> 26)
+  sign = xHi >> 31;                                       // 0 if non-negative, -1 if negative
+  if (sign != (xHi >> 15)) xHi = sign ^ 0x7FFF;           // out of range: clamp to 32767 / -32768
+  return( (short)xHi );
+}
+
 __attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n)
 {
   unsigned int xLo = (unsigned int) x;
 
@@ -142,7 +142,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 	MC0M(6)
 	MC0M(7)
 
-	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+//	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 0) = SAR64_Clip(sum1L);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -158,7 +159,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 	MC1M(6)
 	MC1M(7)
 
-	*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+//	*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 16) = SAR64_Clip(sum1L);
 
 	/* main convolution loop: sum1L = samples 1, 2, 3, ... 15   sum2L = samples 31, 30, ... 17 */
 	coef = coefBase + 16;
@@ -179,8 +181,10 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 		MC2M(7)
 
 		vb1 += 64;
-		*(pcm)       = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
+//		*(pcm)       = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+//		*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm) = SAR64_Clip(sum1L);
+		*(pcm + 2*i) = SAR64_Clip(sum2L);
 		pcm++;
 	}
 }
@@ -255,8 +259,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 	MC0S(6)
 	MC0S(7)
 
-	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+//	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+//	*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 0) = SAR64_Clip(sum1L);
+	*(pcm + 1) = SAR64_Clip(sum1R);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -272,8 +278,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 	MC1S(6)
 	MC1S(7)
 
-	*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+//	*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+//	*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 2*16 + 0) = SAR64_Clip(sum1L);
+	*(pcm + 2*16 + 1) = SAR64_Clip(sum1R);
 
 	/* main convolution loop: sum1L = samples 1, 2, 3, ... 15   sum2L = samples 31, 30, ... 17 */
 	coef = coefBase + 16;
@@ -295,10 +303,15 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 		MC2S(7)
 
 		vb1 += 64;
-		*(pcm + 0)         = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 1)         = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
+//		*(pcm + 0)         = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+//		*(pcm + 1)         = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 0) = SAR64_Clip(sum1L);
+		*(pcm + 1) = SAR64_Clip(sum1R);
+//		*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
+//		*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*2*i + 0) = SAR64_Clip(sum2L);
+		*(pcm + 2*2*i + 1) = SAR64_Clip(sum2R);
+
 		pcm += 2;
 	}
 }