/* * filter-i386.h -- optimized filter routines * * Copyright (C) 1996 * Thomas Sailer (sailer@ife.ee.ethz.ch, hb9jnx@hb9w.che.eu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* ---------------------------------------------------------------------- */ #ifndef _FILTER_I386_H #define _FILTER_I386_H /* ---------------------------------------------------------------------- */ #define __HAVE_ARCH_MAC #define mac(a,b,size) \ (__builtin_constant_p(size) ? __mac_c((a),(b),(size)) : __mac_g((a),(b),(size))) #include extern inline float __mac_g(const float *a, const float *b, unsigned int size) { float sum = 0; unsigned int i; for (i = 0; i < size; i++) sum += (*a++) * (*b++); return sum; } extern inline float __mac_c(const float *a, const float *b, unsigned int size) { float f; /* * inspired from Phil Karn, KA9Q's home page */ switch (size) { case 53: asm volatile ("flds (%1);\n\t" "fmuls (%2);\n\t" "flds 4(%1);\n\t" "fmuls 4(%2);\n\t" "flds 8(%1);\n\t" "fmuls 8(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 12(%1);\n\t" "fmuls 12(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 16(%1);\n\t" "fmuls 16(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 20(%1);\n\t" "fmuls 20(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 24(%1);\n\t" "fmuls 24(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 28(%1);\n\t" "fmuls 28(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 32(%1);\n\t" "fmuls 32(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 36(%1);\n\t" "fmuls 36(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 40(%1);\n\t" "fmuls 40(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 44(%1);\n\t" "fmuls 44(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 48(%1);\n\t" "fmuls 48(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 52(%1);\n\t" "fmuls 52(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 56(%1);\n\t" "fmuls 56(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 60(%1);\n\t" "fmuls 60(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 64(%1);\n\t" "fmuls 64(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 68(%1);\n\t" "fmuls 68(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 72(%1);\n\t" "fmuls 72(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 76(%1);\n\t" "fmuls 76(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 80(%1);\n\t" "fmuls 80(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 84(%1);\n\t" "fmuls 84(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 88(%1);\n\t" "fmuls 88(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 92(%1);\n\t" "fmuls 92(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 96(%1);\n\t" "fmuls 96(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 100(%1);\n\t" "fmuls 100(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 104(%1);\n\t" "fmuls 104(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 108(%1);\n\t" "fmuls 108(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 112(%1);\n\t" "fmuls 112(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 116(%1);\n\t" "fmuls 116(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 120(%1);\n\t" "fmuls 120(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 124(%1);\n\t" "fmuls 124(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 128(%1);\n\t" "fmuls 128(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 132(%1);\n\t" "fmuls 132(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 136(%1);\n\t" "fmuls 136(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 140(%1);\n\t" "fmuls 140(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 144(%1);\n\t" "fmuls 144(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 148(%1);\n\t" "fmuls 148(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 152(%1);\n\t" "fmuls 152(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 156(%1);\n\t" "fmuls 156(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 160(%1);\n\t" "fmuls 160(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 164(%1);\n\t" "fmuls 164(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 168(%1);\n\t" "fmuls 168(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 172(%1);\n\t" "fmuls 172(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 176(%1);\n\t" "fmuls 176(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 180(%1);\n\t" "fmuls 180(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 184(%1);\n\t" "fmuls 184(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 188(%1);\n\t" "fmuls 188(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 192(%1);\n\t" "fmuls 192(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 196(%1);\n\t" "fmuls 196(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 200(%1);\n\t" "fmuls 200(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 204(%1);\n\t" "fmuls 204(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "flds 208(%1);\n\t" "fmuls 208(%2);\n\t" "fxch %%st(2);\n\t" "faddp;\n\t" "faddp;\n\t":"=t" (f):"r"(a), "r"(b):"memory"); return f; default: fprintf(stderr, "Warning: optimize __mac_c(..., ..., %d)\n", size); return __mac_g(a, b, size); } } /* ---------------------------------------------------------------------- */ #endif /* _FILTER_I386_H */