* Added IDCT Altivec optimization [MacOS X port]. Untested, not compiled.

9ab4320a · Christophe Massiot · 63f29665 · 9ab4320a · 9ab4320a · 9ab4320a
Commit 9ab4320a authored Mar 20, 2001 by Christophe Massiot
Hide whitespace changes
Inline Side-by-side

Showing with 426 additions and 1 deletion

Makefile.in Makefile.in +13 -1

plugins/idct/idctaltivec.c plugins/idct/idctaltivec.c +175 -0

plugins/idct/idctaltivec.h plugins/idct/idctaltivec.h +238 -0

No files found.
--- a/Makefile.in
+++ b/Makefile.in
@@ -368,6 +368,9 @@ PLUGIN_IDCTMMX =	plugins/idct/idctmmx.o \
 PLUGIN_IDCTMMXEXT =	plugins/idct/idctmmxext.o \
 			plugins/idct/idct_common.o

+PLUGIN_IDCTALTIVEC =	plugins/idct/idctaltivec.o \
+			plugins/idct/idct_common.o
+
 PLUGIN_MACOSX =	plugins/macosx/macosx.o \
 		plugins/macosx/intf_macosx.o \
 		plugins/macosx/vout_macosx.o
@@ -451,7 +454,8 @@ NONSTD_PLUGIN_OBJ = \
 		$(PLUGIN_X11) \
 		$(PLUGIN_GLIDE) \
 		$(PLUGIN_GTK) \
-		$(PLUGIN_GNOME)
+		$(PLUGIN_GNOME) \
+		$(PLUGIN_IDCTALTIVEC)

 NONSTD_CPP_PLUGIN_OBJ = \
 		$(PLUGIN_BEOS) \
@@ -628,6 +632,11 @@ $(PLUGIN_BEOS): %.o: .dep/%.dpp
 $(PLUGIN_BEOS): %.o: %.cpp
 	$(CC) $(CFLAGS) $(PCFLAGS) -c -o $@ $<

+$(PLUGIN_IDCTALTIVEC): %.o: Makefile.dep
+$(PLUGIN_IDCTALTIVEC): %.o: .dep/%.d
+$(PLUGIN_IDCTALTIVEC): %.o: %.c
+	$(CC) $(CFLAGS) $(PCFLAGS) -c -o $@ $< -faltivec
+
 #
 # Main application target
 #
@@ -793,6 +802,9 @@ lib/idctmmx.so: $(PLUGIN_IDCTMMX)

 lib/idctmmxext.so: $(PLUGIN_IDCTMMXEXT)
 	$(CC) $(PCFLAGS) $(PLCFLAGS) -o $@ $^
+
+lib/idctaltivec.so: $(PLUGIN_IDCTALTIVEC)
+	$(CC) $(PCFLAGS) $(PLCFLAGS) -o $@ $^ -framework VecLib
 endif

 ################################################################################

--- a/plugins/idct/idctaltivec.c
+++ b/plugins/idct/idctaltivec.c
+/*****************************************************************************
+ * idctaltivec.c : Altivec IDCT module
+ *****************************************************************************
+ * Copyright (C) 1999, 2000 VideoLAN
+ * $Id: idctaltivec.c,v 1.1 2001/03/20 20:09:37 massiot Exp $
+ *
+ * Authors: Christophe Massiot <massiot@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* Identifier of this plugin. It is presumably expanded by modules_inner.h
+ * into MODULE_STRING and the module entry points, so it has to name the
+ * Altivec module and must not clash with the idctmmxext plugin. */
+#define MODULE_NAME idctaltivec
+
+/*****************************************************************************
+ * Preamble
+ *****************************************************************************/
+#include "defs.h"
+
+#include <stdlib.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+#include "tests.h"                                              /* TestCPU() */
+
+#include "video.h"
+#include "video_output.h"
+
+#include "video_decoder.h"
+
+#include "modules.h"
+#include "modules_inner.h"
+
+#include "idct.h"
+
+#include "idctaltivec.h"
+
+/*****************************************************************************
+ * Local prototypes.
+ *****************************************************************************
+ * Only the helpers private to this file are declared here. vdec_InitIDCT,
+ * vdec_SparseIDCT and vdec_IDCT are referenced below without a local
+ * prototype; they are presumably declared in idct.h -- TODO confirm.
+ *****************************************************************************/
+static void idct_getfunctions( function_list_t * p_function_list );
+static int  idct_Probe      ( probedata_t *p_data );
+static void vdec_NormScan   ( u8 ppi_scan[2][64] );
+
+
+/*****************************************************************************
+ * Build configuration tree.
+ *****************************************************************************
+ * The module has nothing to tune yet: the tree declared here consists of one
+ * window entry followed by a single placeholder comment entry.
+ *****************************************************************************/
+MODULE_CONFIG_START
+ADD_WINDOW( "Configuration for Altivec IDCT module" )
+    ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
+MODULE_CONFIG_END
+
+/*****************************************************************************
+ * InitModule: describe the module to the main application.
+ *****************************************************************************
+ * Publishes the capability mask and the three identification strings
+ * (psz_name, psz_longname, psz_version). The strings are handed over as
+ * static data; the main application is expected to strdup() them so that
+ * they remain readable after the module has been unloaded to save memory.
+ * Always returns 0.
+ *****************************************************************************/
+MODULE_INIT
+{
+    /* The only service offered by this plugin is an IDCT. */
+    p_module->i_capabilities = MODULE_CAPABILITY_NULL | MODULE_CAPABILITY_IDCT;
+
+    /* Identification strings. */
+    p_module->psz_version  = VERSION;
+    p_module->psz_longname = "Altivec IDCT module";
+    p_module->psz_name     = MODULE_STRING;
+
+    return 0;
+}
+
+/*****************************************************************************
+ * ActivateModule: bring the module into a usable state.
+ *****************************************************************************
+ * Allocates the function table, fills its IDCT entry through
+ * idct_getfunctions() and attaches the configuration structure. After a
+ * successful activation i_usage can be set to 0 and NeedModule() may be
+ * called to increment it; unloading requires waiting for i_usage == 0 and
+ * then calling DeactivateModule().
+ * Returns 0 on success, -1 if the function table could not be allocated.
+ *****************************************************************************/
+MODULE_ACTIVATE
+{
+    p_module->p_functions = malloc( sizeof( module_functions_t ) );
+
+    if( p_module->p_functions != NULL )
+    {
+        idct_getfunctions( &p_module->p_functions->idct );
+        p_module->p_config = p_config;
+
+        return 0;
+    }
+
+    /* Out of memory: no capability function has been registered. */
+    return -1;
+}
+
+/*****************************************************************************
+ * DeactivateModule: make sure the module can be unloaded.
+ *****************************************************************************
+ * This function must only be called when i_usage == 0. If it successfully
+ * returns, i_usage can be set to -1 and the module unloaded. Be careful to
+ * lock usage_lock during the whole process.
+ * Releases the function table allocated by ActivateModule() and always
+ * returns 0.
+ *****************************************************************************/
+MODULE_DEACTIVATE
+{
+    free( p_module->p_functions );
+
+    /* Do not leave a dangling pointer behind: a stale table must neither be
+     * dereferenced nor freed a second time if deactivation is repeated. */
+    p_module->p_functions = NULL;
+
+    return( 0 );
+}
+
+/* Following functions are local */
+
+/*****************************************************************************
+ * idct_getfunctions: publish the entry points of this IDCT implementation.
+ *****************************************************************************
+ * Stores the probe callback and the four IDCT callbacks into the function
+ * list supplied by the caller. Nothing is allocated and nothing is returned.
+ *****************************************************************************/
+static void idct_getfunctions( function_list_t * p_function_list )
+{
+    /* Transform entry points. */
+    p_function_list->functions.idct.pf_idct        = vdec_IDCT;
+    p_function_list->functions.idct.pf_sparse_idct = vdec_SparseIDCT;
+
+    /* Set-up helpers. */
+    p_function_list->functions.idct.pf_init        = vdec_InitIDCT;
+    p_function_list->functions.idct.pf_norm_scan   = vdec_NormScan;
+
+    /* Scoring callback. */
+    p_function_list->pf_probe = idct_Probe;
+}
+
+/*****************************************************************************
+ * idct_Probe: return a preference score
+ *****************************************************************************
+ * Scores: 0 when the CPU does not report the Altivec capability, 999 when
+ * TestMethod() matches "idctaltivec" for IDCT_METHOD_VAR (presumably the
+ * module was requested by name -- confirm against TestMethod), 200 otherwise.
+ * p_data is not consulted.
+ *****************************************************************************/
+static int idct_Probe( probedata_t *p_data )
+{
+    /* No Altivec unit, no point in offering this module. */
+    if( !TestCPU( CPU_CAPABILITY_ALTIVEC ) )
+    {
+        return 0;
+    }
+
+    return TestMethod( IDCT_METHOD_VAR, "idctaltivec" ) ? 999 : 200;
+}
+
+/*****************************************************************************
+ * vdec_NormScan : Soon, transpose
+ *****************************************************************************
+ * Currently a no-op: the tables passed in ppi_scan are left untouched. The
+ * function only exists so that the pf_norm_scan slot can be filled in
+ * idct_getfunctions(); the announced transposition is not implemented yet.
+ *****************************************************************************/
+static void vdec_NormScan( u8 ppi_scan[2][64] )
+{
+}
+
+/*****************************************************************************
+ * vdec_IDCT : run the Altivec IDCT on one block, in place
+ *****************************************************************************
+ * p_block is used both as source and destination of IDCT(). p_vdec and
+ * i_idontcare are not used by this implementation.
+ * NOTE(review): IDCT() takes short pointers and documents a 128-bit
+ * alignment requirement on its buffers; this assumes dctelem_t is a 16-bit
+ * short and that blocks are allocated with that alignment -- TODO confirm.
+ *****************************************************************************/
+void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
+                int i_idontcare )
+{
+    IDCT( p_block, p_block );
+}
+
--- a/plugins/idct/idctaltivec.h
+++ b/plugins/idct/idctaltivec.h
+/***************************************************************
+ *
+ * Copyright:   (c) Copyright Motorola Inc. 1998
+ *
+ * Date:        April 17, 1998
+ *
+ * Function:    Matrix_Transpose
+ *
+ * Description: The following Matrix Transpose is adapted
+ *              from an algorithm developed by Brett Olsson
+ *              from IBM. It performs a 8x8 16-bit element
+ *              full matrix transpose.
+ *
+ *              The transpose is done in three passes of
+ *              vec_mergeh/vec_mergel. Each pass pairs vector i
+ *              with vector i+4 of the previous pass and
+ *              interleaves their high and low halves.
+ *
+ * Inputs:      array elements stored in input
+ *               input[0] = [ 00 01 02 03 04 05 06 07 ]
+ *               input[1] = [ 10 11 12 13 14 15 16 17 ]
+ *               input[2] = [ 20 21 22 23 24 25 26 27 ]
+ *               input[3] = [ 30 31 32 33 34 35 36 37 ]
+ *               input[4] = [ 40 41 42 43 44 45 46 47 ]
+ *               input[5] = [ 50 51 52 53 54 55 56 57 ]
+ *               input[6] = [ 60 61 62 63 64 65 66 67 ]
+ *               input[7] = [ 70 71 72 73 74 75 76 77 ]
+ *
+ * Outputs:     transposed elements in output, i.e. output[i]
+ *              holds column i of the input matrix. output is
+ *              only written once every input row has been read.
+ *
+ **************************************************************/
+
+static __inline__ void Matrix_Transpose ( vector signed short *input,
+                               vector signed short *output )
+{
+  vector signed short a0, a1, a2, a3, a4, a5, a6, a7;
+  vector signed short b0, b1, b2, b3, b4, b5, b6, b7;
+
+  /* Pass 1: interleave row i with row i+4. */
+  b0 = vec_mergeh( input[0], input[4] );     /* [ 00 40 01 41 02 42 03 43 ]*/
+  b1 = vec_mergel( input[0], input[4] );     /* [ 04 44 05 45 06 46 07 47 ]*/
+  b2 = vec_mergeh( input[1], input[5] );     /* [ 10 50 11 51 12 52 13 53 ]*/
+  b3 = vec_mergel( input[1], input[5] );     /* [ 14 54 15 55 16 56 17 57 ]*/
+  b4 = vec_mergeh( input[2], input[6] );     /* [ 20 60 21 61 22 62 23 63 ]*/
+  b5 = vec_mergel( input[2], input[6] );     /* [ 24 64 25 65 26 66 27 67 ]*/
+  b6 = vec_mergeh( input[3], input[7] );     /* [ 30 70 31 71 32 72 33 73 ]*/
+  b7 = vec_mergel( input[3], input[7] );     /* [ 34 74 35 75 36 76 37 77 ]*/
+
+  /* Pass 2: interleave b(i) with b(i+4); each vector now holds the even
+   * or the odd rows of two columns. */
+  a0 = vec_mergeh( b0, b4 );                 /* [ 00 20 40 60 01 21 41 61 ]*/
+  a1 = vec_mergel( b0, b4 );                 /* [ 02 22 42 62 03 23 43 63 ]*/
+  a2 = vec_mergeh( b1, b5 );                 /* [ 04 24 44 64 05 25 45 65 ]*/
+  a3 = vec_mergel( b1, b5 );                 /* [ 06 26 46 66 07 27 47 67 ]*/
+  a4 = vec_mergeh( b2, b6 );                 /* [ 10 30 50 70 11 31 51 71 ]*/
+  a5 = vec_mergel( b2, b6 );                 /* [ 12 32 52 72 13 33 53 73 ]*/
+  a6 = vec_mergeh( b3, b7 );                 /* [ 14 34 54 74 15 35 55 75 ]*/
+  a7 = vec_mergel( b3, b7 );                 /* [ 16 36 56 76 17 37 57 77 ]*/
+
+  /* Pass 3: interleave a(i) with a(i+4); every result is a full column. */
+  output[0] = vec_mergeh( a0, a4 );          /* [ 00 10 20 30 40 50 60 70 ]*/
+  output[1] = vec_mergel( a0, a4 );          /* [ 01 11 21 31 41 51 61 71 ]*/
+  output[2] = vec_mergeh( a1, a5 );          /* [ 02 12 22 32 42 52 62 72 ]*/
+  output[3] = vec_mergel( a1, a5 );          /* [ 03 13 23 33 43 53 63 73 ]*/
+  output[4] = vec_mergeh( a2, a6 );          /* [ 04 14 24 34 44 54 64 74 ]*/
+  output[5] = vec_mergel( a2, a6 );          /* [ 05 15 25 35 45 55 65 75 ]*/
+  output[6] = vec_mergeh( a3, a7 );          /* [ 06 16 26 36 46 56 66 76 ]*/
+  output[7] = vec_mergel( a3, a7 );          /* [ 07 17 27 37 47 57 67 77 ]*/
+
+}
+
+
+/***************************************************************
+ *
+ * Copyright:   (c) Copyright Motorola Inc. 1998
+ *
+ * Date:        April 20, 1998
+ *
+ * Macro:       IDCT_Transform
+ *
+ * Description: Discrete Cosine Transform implemented by the
+ *              Scaled Chen (III) Algorithm developed by Haifa
+ *              Research Lab.  The major difference between this
+ *              algorithm and the Scaled Chen (I) is that
+ *              certain multiply-subtracts are replaced by
+ *              multiply adds.  A full description of the
+ *              Scaled Chen (I) algorithm can be found in:
+ *              W.H.Chen, C.H.Smith and S.C.Fralick, "A Fast
+ *              Computational Algorithm for the Discrete Cosine
+ *              Transform", IEEE Transactions on Communications,
+ *              Vol. COM-25, No. 9, pp 1004-1009, Sept. 1977.
+ *
+ *              One expansion transforms the eight vectors of vx
+ *              in a single pass; all additions and subtractions
+ *              use the saturating vec_adds/vec_subs forms.
+ *
+ *              The macro expands to a bare sequence of
+ *              statements (it is not wrapped in a block) and
+ *              refers to the caller's local variables listed
+ *              below by name, so it can only be used as a
+ *              full statement inside a function declaring them.
+ *
+ * Inputs:      vx     : array of vector short
+ *              t0-t9  : temporary vector variables set up by caller
+ *              c4     : cos(4*pi/16)
+ *              mc4    : -c4
+ *              a0     : c6/c2
+ *              a1     : c7/c1
+ *              a2     : c5/c3
+ *              ma2    : -a2
+ *              zero   : an array of zero elements
+ *
+ * Outputs:     vy     : array of vector short
+ *
+ **************************************************************/
+
+#define IDCT_Transform(vx,vy) \
+                                                                  \
+  /* 1st stage. */                                                \
+  t9 = vec_mradds( a1, vx[1], zero );  /* t8 = (a1) * x1 - x7  */ \
+  t8 = vec_subs( t9, vx[7]);                                      \
+  t1 = vec_mradds( a1, vx[7], vx[1] ); /* t1 = (a1) * x7 + x1  */ \
+  t7 = vec_mradds( a2, vx[5], vx[3] ); /* t7 = (a2) * x5 + x3  */ \
+  t3 = vec_mradds( ma2, vx[3], vx[5] );/* t3 = (-a2) * x3 + x5 */ \
+                                                                  \
+  /* 2nd stage */                                                 \
+  t5 = vec_adds( vx[0], vx[4] );        /* t5 = x0 + x4 */        \
+  t0 = vec_subs( vx[0], vx[4] );        /* t0 = x0 - x4 */        \
+  t9 = vec_mradds( a0, vx[2], zero );   /* t4 = (a0) * x2 - x6 */ \
+  t4 = vec_subs( t9, vx[6] );                                     \
+  t2 = vec_mradds( a0, vx[6], vx[2] );  /* t2 = (a0) * x6 + x2 */ \
+                                                                  \
+  t6 = vec_adds( t8, t3 );              /* t6 = t8 + t3 */        \
+  t3 = vec_subs( t8, t3 );              /* t3 = t8 - t3 */        \
+  t8 = vec_subs( t1, t7 );              /* t8 = t1 - t7 */        \
+  t1 = vec_adds( t1, t7 );              /* t1 = t1 + t7 */        \
+                                                                  \
+  /* 3rd stage. */                                                \
+  t7 = vec_adds( t5, t2 );              /* t7 = t5 + t2 */        \
+  t2 = vec_subs( t5, t2 );              /* t2 = t5 - t2 */        \
+  t5 = vec_adds( t0, t4 );              /* t5 = t0 + t4 */        \
+  t0 = vec_subs( t0, t4 );              /* t0 = t0 - t4 */        \
+                                                                  \
+  t4 = vec_subs( t8, t3 );              /* t4 = t8 - t3 */        \
+  t3 = vec_adds( t8, t3 );              /* t3 = t8 + t3 */        \
+                                                                  \
+  /* 4th stage. */                                                \
+  vy[0] = vec_adds( t7, t1 );        /* y0 = t7 + t1 */           \
+  vy[7] = vec_subs( t7, t1 );        /* y7 = t7 - t1 */           \
+  vy[1] = vec_mradds( c4, t3, t5 );  /* y1 = (c4) * t3 + t5  */   \
+  vy[6] = vec_mradds( mc4, t3, t5 ); /* y6 = (-c4) * t3 + t5 */   \
+  vy[2] = vec_mradds( c4, t4, t0 );  /* y2 = (c4) * t4 + t0  */   \
+  vy[5] = vec_mradds( mc4, t4, t0 ); /* y5 = (-c4) * t4 + t0 */   \
+  vy[3] = vec_adds( t2, t6 );        /* y3 = t2 + t6 */           \
+  vy[4] = vec_subs( t2, t6 );        /* y4 = t2 - t6 */
+
+
+/* Pre-Scaling matrix -- scaled by 1
+ *
+ * PreScale[i] is multiplied element-wise (vec_mradds) with row i of the
+ * input block in IDCT() before the first transform pass. The table is
+ * symmetric, and rows 0/4, 1/7, 2/6 and 3/5 are pairwise identical.
+ * NOTE(review): the values look like the outer product of one per-index
+ * scale vector with itself in 16-bit fixed point -- confirm against the
+ * Scaled Chen derivation before editing any entry.
+ * Being a static object defined in a header, every translation unit that
+ * includes this file gets its own copy. */
+static vector signed short PreScale[8] = {
+    (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+    (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 ),
+    (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+    (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+    (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+    (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+    (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+    (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 )
+};
+
+/***************************************************************
+ *
+ * Copyright:   (c) Copyright Motorola Inc. 1998
+ *
+ * Date:        April 17, 1998
+ *
+ * Function:    IDCT
+ *
+ * Description: Scaled Chen (III) algorithm for IDCT
+ *              Arithmetic is 16-bit fixed point.
+ *
+ *              Steps: pre-scale the eight input rows with
+ *              PreScale, run IDCT_Transform, transpose the
+ *              intermediate result, run IDCT_Transform again
+ *              and copy the eight result vectors to output.
+ *              All input rows are loaded before the first
+ *              store to output, so input and output may point
+ *              to the same buffer (vdec_IDCT relies on this).
+ *
+ * Inputs:      input - Pointer to input data (short), which
+ *                      must be between -2048 to +2047.
+ *                      It is assumed that the allocated array
+ *                      has been 128-bit aligned and contains
+ *                      8x8 short elements.
+ *
+ * Outputs:     output - Pointer to output area for the transformed
+ *                       data. The output values are between -255
+ *                       and 255 . It is assumed that a 128-bit
+ *                       aligned 8x8 array of short has been
+ *                       pre-allocated.
+ *
+ * Return:      None
+ *
+ ***************************************************************/
+
+static __inline__ void IDCT(short *input, short *output) {
+
+  /* t0-t9, the constants and zero are referenced by name from inside the
+   * IDCT_Transform macro; renaming any of them breaks its expansion. */
+  vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+  vector signed short a0, a1, a2, ma2, c4, mc4, zero;
+  vector signed short vx[8], vy[8];
+  vector signed short *vec_ptr;  /* used for conversion between
+                                    arrays of short and vector
+                                    signed short array.  */
+
+
+  /* Load the multiplication constants.  Note: these constants
+   * could all be loaded directly ( like zero case ), but using the
+   * SpecialConstants approach causes vsplth instructions to be
+   * generated instead of lvx which is more efficient given the remainder
+   * of the instruction mix.
+   */
+  vector signed short SpecialConstants =
+     (vector signed short)( 23170, 13573, 6518, 21895, -23170, -21895, 0 , 0
+);
+
+  c4   = vec_splat( SpecialConstants, 0 );  /* c4 = cos(4*pi/16)  */
+  a0   = vec_splat( SpecialConstants, 1 );  /* a0 = c6/c2         */
+  a1   = vec_splat( SpecialConstants, 2 );  /* a1 = c7/c1         */
+  a2   = vec_splat( SpecialConstants, 3 );  /* a2 = c5/c3         */
+  mc4  = vec_splat( SpecialConstants, 4 );  /* -c4                */
+  ma2  = vec_splat( SpecialConstants, 5 );  /* -a2                */
+  zero = (vector signed short)(0);
+
+  /* Load the rows of input data and Pre-Scale them. */
+  vec_ptr = ( vector signed short * ) input;
+  vx[0] = vec_mradds( vec_ptr[0], PreScale[0], zero );
+  vx[1] = vec_mradds( vec_ptr[1], PreScale[1], zero );
+  vx[2] = vec_mradds( vec_ptr[2], PreScale[2], zero );
+  vx[3] = vec_mradds( vec_ptr[3], PreScale[3], zero );
+  vx[4] = vec_mradds( vec_ptr[4], PreScale[4], zero );
+  vx[5] = vec_mradds( vec_ptr[5], PreScale[5], zero );
+  vx[6] = vec_mradds( vec_ptr[6], PreScale[6], zero );
+  vx[7] = vec_mradds( vec_ptr[7], PreScale[7], zero );
+
+  /* Perform IDCT first on the 8 columns */
+  IDCT_Transform( vx, vy );
+
+  /* Transpose matrix to work on rows */
+  Matrix_Transpose( vy, vx );
+
+  /* Perform IDCT next on the 8 rows */
+  IDCT_Transform( vx, vy );
+
+  /* Store result. The vectors are copied as they are: no separate
+   * post-scaling step is performed here. */
+  vec_ptr = (vector signed short *) output;
+  vec_ptr[0] = vy[0];
+  vec_ptr[1] = vy[1];
+  vec_ptr[2] = vy[2];
+  vec_ptr[3] = vy[3];
+  vec_ptr[4] = vy[4];
+  vec_ptr[5] = vy[5];
+  vec_ptr[6] = vy[6];
+  vec_ptr[7] = vy[7];
+
+}
+