/*
 *      filter-i386.h -- optimized filter routines
 *
 *      Copyright (C) 1996
 *          Thomas Sailer (sailer@ife.ee.ethz.ch, hb9jnx@hb9w.che.eu)
 *
 *      This program is free software; you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published by
 *      the Free Software Foundation; either version 2 of the License, or
 *      (at your option) any later version.
 *
 *      This program is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *      GNU General Public License for more details.
 *
 *      You should have received a copy of the GNU General Public License
 *      along with this program; if not, write to the Free Software
 *      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* ---------------------------------------------------------------------- */

#ifndef _FILTER_I386_H
#define _FILTER_I386_H

/* ---------------------------------------------------------------------- */

/*
 * Marker announcing that this header supplies its own mac().
 * NOTE(review): presumably tested by the generic filter header so that it
 * skips its portable mac(); the consumer is not visible from this file.
 */
#define __HAVE_ARCH_MAC

/*
 * mac(a, b, size) -- multiply-accumulate of two float arrays.
 *
 * When the compiler reports `size` as a compile-time constant
 * (__builtin_constant_p), the call goes to __mac_c(), which switches on the
 * size to pick an unrolled implementation; otherwise it goes to the generic
 * loop in __mac_g().  All three arguments are forwarded unchanged.
 */
#define mac(a,b,size) \
	(__builtin_constant_p(size) ? __mac_c((a),(b),(size)) : __mac_g((a),(b),(size)))

#include <stdio.h>

/*
 * __mac_g -- generic multiply-accumulate (dot product) of two float arrays.
 *
 * a, b:  input arrays; each must provide at least `size` readable elements
 * size:  number of element pairs to process
 *
 * Returns the sum of a[i] * b[i] for i = 0 .. size-1, accumulated into a
 * float in ascending index order.  Returns 0 when size is 0.
 *
 * Declared `static inline` so that the header can be included from several
 * translation units under both GNU89 and C99/C11 inline semantics without
 * producing missing or duplicate external definitions.
 */
static inline float __mac_g(const float *a, const float *b,
			    unsigned int size)
{
	float sum = 0;
	unsigned int i;

	for (i = 0; i < size; i++)
		sum += a[i] * b[i];
	return sum;
}

/*
 * Asm template fragment for one further term of the unrolled sum in
 * __mac_c(): push a[off/4] * b[off/4] onto the x87 stack, exchange it with
 * st(2), then let faddp fold the two older stack entries together.  The
 * freshly computed product therefore stays pending in st(1) while the
 * previous one is added, and is itself added by the next fragment (or by
 * the trailing faddp).  `off` is the byte offset and must be a literal.
 * Only meaningful inside the asm statement below; undefined again after it.
 */
#define __MAC_TERM(off) \
	"flds " #off "(%1);\n\t" \
	"fmuls " #off "(%2);\n\t" \
	"fxch %%st(2);\n\t" \
	"faddp;\n\t"

/*
 * __mac_c -- multiply-accumulate for sizes known at compile time.
 *
 * a, b:  input arrays; each must provide at least `size` readable floats
 * size:  number of element pairs; mac() only routes here when the compiler
 *        considers it a constant
 *
 * size == 53 is handled by fully unrolled x87 code covering byte offsets
 * 0 .. 208 (53 floats).  Any other size prints a warning to stderr and falls
 * back to the generic loop in __mac_g().
 *
 * Returns the sum of a[i] * b[i] for i = 0 .. size-1.
 */
static inline float __mac_c(const float *a, const float *b,
			    unsigned int size)
{
	float f;

	/*
	 * inspired from Phil Karn, KA9Q's home page
	 */
	switch (size) {
	case 53:
		/*
		 * __asm__/__volatile__ rather than asm/volatile so the header
		 * also compiles in strict ISO modes (-std=c99, -std=c11).
		 *
		 * The code keeps up to three values on the x87 stack, so st(1)
		 * and st(2) are listed as clobbers in addition to the "=t"
		 * output; this makes the compiler keep those slots free.
		 * "memory" is kept because the arrays are read through plain
		 * register operands.
		 */
		__asm__ __volatile__ (
			/* first two products: st(0) = a[1]*b[1], st(1) = a[0]*b[0] */
			"flds (%1);\n\t"
			"fmuls (%2);\n\t"
			"flds 4(%1);\n\t"
			"fmuls 4(%2);\n\t"
			/* elements 2 .. 52 (byte offsets 8 .. 208) */
			__MAC_TERM(8)   __MAC_TERM(12)  __MAC_TERM(16)
			__MAC_TERM(20)  __MAC_TERM(24)  __MAC_TERM(28)
			__MAC_TERM(32)  __MAC_TERM(36)  __MAC_TERM(40)
			__MAC_TERM(44)  __MAC_TERM(48)  __MAC_TERM(52)
			__MAC_TERM(56)  __MAC_TERM(60)  __MAC_TERM(64)
			__MAC_TERM(68)  __MAC_TERM(72)  __MAC_TERM(76)
			__MAC_TERM(80)  __MAC_TERM(84)  __MAC_TERM(88)
			__MAC_TERM(92)  __MAC_TERM(96)  __MAC_TERM(100)
			__MAC_TERM(104) __MAC_TERM(108) __MAC_TERM(112)
			__MAC_TERM(116) __MAC_TERM(120) __MAC_TERM(124)
			__MAC_TERM(128) __MAC_TERM(132) __MAC_TERM(136)
			__MAC_TERM(140) __MAC_TERM(144) __MAC_TERM(148)
			__MAC_TERM(152) __MAC_TERM(156) __MAC_TERM(160)
			__MAC_TERM(164) __MAC_TERM(168) __MAC_TERM(172)
			__MAC_TERM(176) __MAC_TERM(180) __MAC_TERM(184)
			__MAC_TERM(188) __MAC_TERM(192) __MAC_TERM(196)
			__MAC_TERM(200) __MAC_TERM(204) __MAC_TERM(208)
			/* add the last pending product; result is left in st(0) */
			"faddp;\n\t"
			: "=t" (f)
			: "r" (a), "r" (b)
			: "memory", "st(1)", "st(2)");
		return f;

	default:
		/* size is unsigned int, so it is printed with %u */
		fprintf(stderr,
			"Warning: optimize __mac_c(..., ..., %u)\n", size);
		return __mac_g(a, b, size);
	}
}

#undef __MAC_TERM

/* ---------------------------------------------------------------------- */
#endif /* _FILTER_I386_H */