i965_drv_video: add video processing kernels

20975a94 · Xiang, Haihao · 5f2030ba · 20975a94 · 20975a94 · 20975a94
Commit 20975a94 authored Sep 02, 2010 by Xiang, Haihao
109 changed files
--- a/configure.ac
+++ b/configure.ac
@@ -163,6 +163,7 @@ AC_OUTPUT([
 	i965_drv_video/shaders/mpeg2/Makefile
 	i965_drv_video/shaders/mpeg2/vld/Makefile
 	i965_drv_video/shaders/render/Makefile
+	i965_drv_video/shaders/post_processing/Makefile
 	test/Makefile
 	test/basic/Makefile
 	test/decode/Makefile

--- a/i965_drv_video/shaders/Makefile.am
+++ b/i965_drv_video/shaders/Makefile.am
-SUBDIRS = h264 mpeg2 render
+SUBDIRS = h264 mpeg2 render post_processing
--- a/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: AYUV_Load_16x8.asm
+//
+// Reads a 16x8-pixel block of packed AYUV (4 bytes per pixel, i.e. 64x8 bytes)
+// with two 32x8-byte data port reads, then splits the packed bytes into the
+// separate uwDEST_Y / uwDEST_U / uwDEST_V destinations.
+//----------------------------------------------------------------
+#include "AYUV_Load_16x8.inc"
+// In order to load 64x8 AYUV data (16x8 pixels), we need to divide the data
+// into two regions and load them separately.
+//
+//       32 byte         32 byte
+//|----------------|----------------|
+//|                |                |
+//|       A        |       B        |8
+//|                |                |
+//|                |                |
+//|----------------|----------------|
+// Load the first 32x8 data block
+// Packed data block should be loaded as a 32x8 byte block (8 AYUV pixels per row)
+// NOTE(review): the shl reads acc0 immediately after the add, so it presumably
+// relies on the accumulator holding the X sum just computed - confirm.
+// NoDDClr on the shl and NoDDChk on the following mov bracket the two
+// back-to-back partial writes to rMSGSRC.
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w    // Source Block origin
+    shl  (1) rMSGSRC.0<1>:d     acc0:w            2:w          { NoDDClr }      // H. block origin need to be four times larger
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV:ud         { NoDDChk }      // Block width and height (32x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_YUV(0)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
+// Load the second 32x8 data block
+// Offset the origin X - move to the next 32 byte columns
+    add (1) rMSGSRC.0<1>:d    rMSGSRC.0<0;1,0>:d    32:w                        // Increase X origin by 32 bytes (8 AYUV pixels)
+// Size stays the same - 32x8
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud                               // Copy message description to message header
+    send (8) udSRC_YUV(8)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
+// Give AYUV region addresses to address register
+// The immediate evaluates to 0x0038*32 in the low word and 0x0040*32 in the
+// high word: two offsets that are 8*32 bytes apart, one per loaded half.
+// NOTE(review): presumably the GRF byte addresses of udSRC_YUV(0) and
+// udSRC_YUV(8) - confirm against the nBOT_Y register assignment.
+    mov (1) SRC_YUV_OFFSET<1>:ud 0x00400038*32:ud                               //Address registers contain starting addresses of two halves
+// Directly move the data to destination
+// For each of the nY_NUM_OF_ROWS rows, pick every 4th byte from both halves:
+// byte offset +2 goes to Y, +1 to U and +0 to V. Byte +3 (presumably alpha)
+// is not copied.
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16) uwDEST_Y(%1)<1> r[SRC_YUV_OFFSET,%1*32+2]<8,4>:ub
+        mov (16) uwDEST_U(%1)<1> r[SRC_YUV_OFFSET,%1*32+1]<8,4>:ub
+        mov (16) uwDEST_V(%1)<1> r[SRC_YUV_OFFSET,%1*32+0]<8,4>:ub
+    }        
\ No newline at end of file
--- a/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc
+++ b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: AYUV_Load_16x8.inc
+//
+// Definitions used by AYUV_Load_16x8.asm: row count, data port block/response
+// sizes, and the source/destination register aliases.
+//
+// AYUV data are first loaded to bottom I/O REGION_2, then unpacked to planar data
+// and stored in top I/O REGION_1
+#undef 	nY_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS      8       // Number of Y rows per block
+#define nDPR_BLOCK_SIZE_YUV           nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // Y block size 32x8
+#define nDPR_MSG_SIZE_YUV             nRESLEN_8                         // # of MRF's to hold Y block data (8)
+//Temporary storage for unpacked AYUV data
+// NOTE(review): the UNPACK_TEMP names are not referenced by the visible
+// AYUV_Load_16x8.asm - check for other users before removing.
+#define     rUNPACK_TEMP     REG(r,nTEMP0)
+.declare    udUNPACK_TEMP    Base=rUNPACK_TEMP    ElementSize=4    SrcRegion=<8;8,1>    Type=ud        //1 GRF
+.declare    ubUNPACK_TEMP    Base=rUNPACK_TEMP    ElementSize=1    SrcRegion=<32;32,1>    Type=ub        //1 GRF
+// Byte view of the bottom Y I/O registers; aliased below as the packed source
+.declare ubBOT_Y_IO     Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(32,1) Type=ub
+// Packed source aliases (bottom region)
+#define udSRC_YUV               udBOT_Y_IO
+#define ubSRC_YUV               ubBOT_Y_IO
+#define nSRC_YUV_REG            nBOT_Y
+// Planar destination aliases (top region)
+#define uwDEST_Y                uwTOP_Y
+#define uwDEST_U                uwTOP_U
+#define uwDEST_V                uwTOP_V
+// Address register used for the indirect reads during unpacking
+#define SRC_YUV_OFFSET a0.0
+#define nSRC_REGION nREGION_1    // REGION_1 will be the source region for first kernel
+// End of AYUV_Load_16x8.inc
--- a/i965_drv_video/shaders/post_processing/Common/Expansion.inc
+++ b/i965_drv_video/shaders/post_processing/Common/Expansion.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: Expansion.inc
+//
+// Redefines nUV_NUM_OF_ROWS and the uwDEST_U / uwDEST_V aliases for the
+// expansion step, based on EXPAND_9x5 and on the current nSRC_REGION.
+// Number of U/V rows per block definition
+#undef 	nUV_NUM_OF_ROWS
+#ifdef EXPAND_9x5
+	#define nUV_NUM_OF_ROWS     6
+#else
+	#define nUV_NUM_OF_ROWS     8
+#endif
+// Source/destination region definitions
+// The destination aliases follow the source region: REGION_1 maps to the TOP
+// registers, REGION_2 to the BOT registers.
+// NOTE(review): there is no #else branch, so if nSRC_REGION is neither value
+// uwDEST_U / uwDEST_V stay undefined after the #undef lines below.
+#undef uwDEST_U
+#undef uwDEST_V
+#if (nSRC_REGION==nREGION_1)
+	#define uwDEST_U          uwTOP_U
+	#define uwDEST_V          uwTOP_V
+#elif (nSRC_REGION==nREGION_2)
+	#define uwDEST_U          uwBOT_U
+	#define uwDEST_V          uwBOT_V
+#endif
+// End of Expansion.inc
--- a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: IMC3_Load_8x4.asm
+//
+// Reads a 16x8 Y block and 8x4 U and V blocks from three separate planar
+// surfaces (binding indices nBI_CURRENT_SRC_Y/_U/_V) and converts the bytes
+// to words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips both the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  IMC3_LOAD_8x4
+#include "PL3_Load.inc"
+// Load 16x8 planar Y ----------------------------------------------------------
+// The origin is computed even when LOAD_UV_ONLY is set because the U/V origin
+// below is derived from it.
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 8x4 planar U and V -----------------------------------------------------
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x4)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
+// The V read uses its own header register: send the header that was just
+// written (mMSGHDRV), not the U header that the previous send consumed.
+    mov  (8) mMSGHDRV<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_V(0)<1>      mMSGHDRV    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud
+// Convert to word-aligned format ----------------------------------------------
+// Both loops count down to 0; each mov (16) widens 16 source bytes to words.
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>         ubSRC_Y(0,%1*16)
+    }
+#endif
+// One mov covers 16 chroma bytes, hence nUV_NUM_OF_ROWS/2 iterations
+// (presumably two 8-byte rows per mov - confirm against PL3_Load.inc).
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0, %1*16)<1>        ubSRC_U(0, %1*16)
+        mov (16)  uwDEST_V(0, %1*16)<1>        ubSRC_V(0, %1*16)
+    }
+// End of IMC3_Load_8x4
--- a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm
+++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: IMC3_Load_8x5.asm
+//
+// Reads a 16x8 Y block and 8x5 U and V blocks from three separate planar
+// surfaces (binding indices nBI_CURRENT_SRC_Y/_U/_V) and converts the bytes
+// to words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips both the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  IMC3_LOAD_8x5
+#include "PL3_Load.inc"
+// Load 16x8 planar Y ----------------------------------------------------------
+// The origin is computed even when LOAD_UV_ONLY is set because the U/V origin
+// below is derived from it.
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 8x5 planar U and V -----------------------------------------------------
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x5)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
+// The V read uses its own header register: send the header that was just
+// written (mMSGHDRV), not the U header that the previous send consumed.
+    mov  (8) mMSGHDRV<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_V(0)<1>      mMSGHDRV    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud
+// Convert to word-aligned format ----------------------------------------------
+// Both loops count down to 0; each mov (16) widens 16 source bytes to words.
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>         ubSRC_Y(0,%1*16)
+    }
+#endif
+// One mov covers 16 chroma bytes, hence nUV_NUM_OF_ROWS/2 iterations
+// (presumably two 8-byte rows per mov - confirm against PL3_Load.inc).
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0, %1*16)<1>        ubSRC_U(0, %1*16)
+        mov (16)  uwDEST_V(0, %1*16)<1>        ubSRC_V(0, %1*16)
+    }
+// End of IMC3_Load_8x5
--- a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm
+++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: IMC3_Load_9x5.asm
+//
+//----------------------------------------------------------------
+//  This module loads 16x8 Y, 9x5 U and 9x5 V planar data blocks for CSC module
+//  and stores it in word-aligned format (each source byte is widened to a word).
+//  Defining LOAD_UV_ONLY skips both the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  IMC3_LOAD_9x5
+#include "PL3_Load.inc"
+// Load 16x8 planar Y ----------------------------------------------------------
+// The origin is computed even when LOAD_UV_ONLY is set because the U/V origin
+// below is derived from it.
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 9x5 planar U and V -----------------------------------------------------
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (12x5)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
+// The V read uses its own header register: send the header that was just
+// written (mMSGHDRV), not the U header that the previous send consumed.
+    mov  (8) mMSGHDRV<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_V(0)<1>      mMSGHDRV    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud
+// Convert to word-aligned format ----------------------------------------------
+// Both loops count down to 0; each mov (16) widens 16 source bytes to words.
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>         ubSRC_Y(0,%1*16)
+    }
+#endif
+// Runs nUV_NUM_OF_ROWS-1 iterations, one 16-byte step each
+// (presumably one chroma row per step - confirm against PL3_Load.inc).
+    $for(nUV_NUM_OF_ROWS-2; >-1; -1) {
+        mov (16)  uwDEST_U(0, %1*16)<1>        ubSRC_U(0, %1*16)
+        mov (16)  uwDEST_V(0, %1*16)<1>        ubSRC_V(0, %1*16)
+    }
+// End of IMC3_Load_9x5
--- a/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm
+++ b/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Init_All_Regs.asm
+// When GT is defined, writes zero to udGRF_space(7) through udGRF_space(79),
+// 8 dwords each. udGRF_space is based at r0.0, so registers 0-6 are left
+// untouched (presumably they hold thread payload - confirm).
+// Without GT this file emits no instructions.
+#ifdef GT	// to remove error messages of un-initialized GRF
+	.declare 	udGRF_space	 	 Base=r0.0 ElementSize=4 SrcRegion=REGION(8,1) Type=ud	
+	$for (7; <80; 1) {
+		mov (8) udGRF_space(%1)<1>	0:ud
+	}
+#else
+#endif
\ No newline at end of file
--- a/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm
+++ b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+/////////////////////////////////////////////////////////////////////////////////
+// Multiple_Loop.asm
+// Loop tail for processing several 16x8 blocks per thread: decrements the
+// block counter, advances the block origin (and the scaling / blending
+// origins when enabled), then either exits or jumps back to
+// VIDEO_PROCESSING_LOOP.
+//
+// This label is for satisfying component kernel build.
+// DL will remove this label and reference the real one in Multiple_Loop_Head.asm.
+#if defined(COMPONENT)
+VIDEO_PROCESSING_LOOP:
+#endif
+//===== Possible build flags for component kernels
+// 1) INC_SCALING
+// 2) INC_BLENDING
+// 3) INC_BLENDING and INC_SCALING
+// 4) (no flags)
+// MxN_MULTIPLE_BLOCKS is defined unconditionally here, so the loop body below
+// is always assembled.
+#define MxN_MULTIPLE_BLOCKS
+//------------------------------------------------------------------------------
+#if defined(MxN_MULTIPLE_BLOCKS)
+// Do Multiple Block Processing ------------------------------------------------
+	// The 1st block has been processed before entering the loop
+	// Processed all blocks?  f0.0 is set when the decremented count is zero
+	add.z.f0.0	(1)	wNUM_BLKS:w	wNUM_BLKS:w	-1:w
+	// Reached multi-block width?  f0.1 is set while the new X origin is
+	// still less than wFRAME_ENDX
+	add			(1)	wORIX:w		wORIX:w		16:w
+	cmp.l.f0.1	(1)	null:w		acc0.0:w	wFRAME_ENDX:w	// acc0.0 has wORIX
+	// The origin updates below sit between the flag-setting instructions and
+	// the predicated jumps; they carry no conditional modifier.
+	#if defined(INC_SCALING)
+	// Update SRC_VID_H_ORI for scaling (advance by 16 horizontal steps)
+		mul	(1)	REG(r,nTEMP0):f		fVIDEO_STEP_X:f		16.0:f
+		add	(1)	fSRC_VID_H_ORI:f	REG(r,nTEMP0):f		fSRC_VID_H_ORI:f
+	#endif
+	#if defined(INC_BLENDING)
+	// Update SRC_ALPHA_H_ORI for blending (advance by 16 horizontal steps)
+		mul	(1)	REG(r,nTEMP0):f		fALPHA_STEP_X:f		16.0:f
+		add	(1)	fSRC_ALPHA_H_ORI:f	REG(r,nTEMP0):f		fSRC_ALPHA_H_ORI:f
+	#endif
+	(f0.0)jmpi	(1)	END_VIDEO_PROCESSING	// All blocks are done - Exit loop
+	(f0.1)jmpi	(1)	VIDEO_PROCESSING_LOOP	// If not the end of row, goto the beginning of the loop
+	//If end of row, restart Horizontal offset and calculate Vertical offsets next row.
+	mov	(1)		wORIX:w		wCOPY_ORIX:w
+	add	(1)		wORIY:w		wORIY:w			8:w
+	#if defined(INC_SCALING)
+	// Update SRC_VID_H_ORI and SRC_VID_V_ORI for scaling
+		mov	(1)		fSRC_VID_H_ORI:f	fFRAME_VID_ORIX:f	// Restore the video X origin from the saved fFRAME_VID_ORIX value
+		mul	(1)		REG(r,nTEMP0):f		fVIDEO_STEP_Y:f		8.0:f
+		add	(1)		fSRC_VID_V_ORI:f	REG(r,nTEMP0):f		fSRC_VID_V_ORI:f
+	#endif
+	#if defined(INC_BLENDING)
+	// Update SRC_ALPHA_H_ORI and SRC_ALPHA_V_ORI for blending
+		mov	(1)		fSRC_ALPHA_H_ORI:f	fFRAME_ALPHA_ORIX:f	// Restore the alpha X origin from the saved fFRAME_ALPHA_ORIX value
+		mul	(1)		REG(r,nTEMP0):f		fALPHA_STEP_Y:f		8.0:f
+		add	(1)		fSRC_ALPHA_V_ORI:f	REG(r,nTEMP0):f		fSRC_ALPHA_V_ORI:f
+	#endif
+	jmpi (1)	VIDEO_PROCESSING_LOOP	// Continue Loop
+END_VIDEO_PROCESSING:
+	nop
+#endif
+END_THREAD	// End of Thread
\ No newline at end of file
--- a/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm
+++ b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//////////////////////////////////////////////////////////////////////////////////
+// Multiple_Loop_Head.asm
+// This code sets up the loop control for multiple blocks per thread
+//
+// Computes wFRAME_ENDX = ubBLK_CNT_X * 16 + wORIX (the mul and the final add),
+// copies the block count into a word counter, and saves the starting X
+// origins so they can be restored at the end of each row.
+// NOTE(review): the NoDDClr / NoDDChk options on the first three instructions
+// presumably mean their destinations share one register - confirm.
+	mul (1)	wFRAME_ENDX:w	ubBLK_CNT_X:ub	16:uw	{ NoDDClr }				// Build multi-block loop counters
+	mov (1) wNUM_BLKS:w		ubNUM_BLKS:ub			{ NoDDClr, NoDDChk }	// Copy num blocks to word variable
+	mov (1) wCOPY_ORIX:w	wORIX:w					{ NoDDChk }				// Copy multi-block origin in pixel
+	// The <4;2,2> source region reads two floats two elements apart,
+	// starting at fSRC_VID_H_ORI.
+	mov (2) fFRAME_VID_ORIX<1>:f			fSRC_VID_H_ORI<4;2,2>:f			// Copy src video origin for scaling, and alpha origin for blending
+	add (1)	wFRAME_ENDX:w	wFRAME_ENDX:w	wORIX:w							// Continue building multi-block loop counters
+VIDEO_PROCESSING_LOOP:		// Loop back entry point as the beginning of the loop for multiple blocks
+// Beginning of the loop
--- a/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: NV11_Load_4x8.asm
+//
+// Reads a 16x8 Y block and an interleaved UV block from the NV11 source
+// surfaces, then converts Y, U and V bytes to words in uwDEST_Y / uwDEST_U /
+// uwDEST_V. Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  NV11_LOAD_4x8
+#include "PL2_Load.inc"
+// Load 16x8 NV11 Y ------------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 8x8 NV11 UV ----------------------------------------------------------
+// Only the X origin (rMSGSRC.0) is halved; the Y origin is left unchanged.
+    asr (1)  rMSGSRC.0<1>:d     rMSGSRC.0<0;1,0>:d       1:w   // U/V block X origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x8)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: with a byte stride of 2, U is taken from the even byte
+// offsets and V from the odd ones (+1). Each iteration consumes 32 source
+// bytes, and the loop runs nUV_NUM_OF_ROWS/4 times, counting down to 0.
+    $for (nUV_NUM_OF_ROWS/4-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<32;16,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+// End of NV11_Load_4x8
--- a/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: NV11_Load_5x8.asm
+//
+// Reads a 16x8 Y block and an interleaved UV block (12x8 per the comments
+// below) from the NV11 source surfaces, then converts Y, U and V bytes to
+// words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  NV11_LOAD_5x8
+#include "PL2_Load.inc"
+// Load 16x8 NV11 Y ------------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 12x8 NV11 UV ---------------------------------------------------------
+// Only the X origin (rMSGSRC.0) is halved; the Y origin is left unchanged.
+    asr (1)  rMSGSRC.0<1>:d     rMSGSRC.0<0;1,0>:d       1:w   // U/V block X origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (12x8)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: the <16;8,2> region takes 8 bytes at stride 2 from each of
+// two consecutive 16-byte groups; U starts at the even offset, V at +1.
+// Each iteration consumes 32 source bytes, and the loop runs
+// nUV_NUM_OF_ROWS/2 times, counting down to 0.
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<16;8,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<16;8,2>
+    }
+// End of NV11_Load_5x8
--- a/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: NV12_Load_8x4.asm
+//
+// Reads a 16x8 Y block and an interleaved UV block (16x4 per the comments
+// below) from the NV12 source surfaces, then converts Y, U and V bytes to
+// words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  NV12_LOAD_8x4
+#include "PL2_Load.inc"
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 8x4 planar U and V -----------------------------------------------------
+// Only the Y origin (rMSGSRC.1) is halved; the X origin is left unchanged.
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block Y origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (16x4)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: with a byte stride of 2, U is taken from the even byte
+// offsets and V from the odd ones (+1). Each iteration consumes 32 source
+// bytes, and the loop runs nUV_NUM_OF_ROWS/2 times, counting down to 0.
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<32;16,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+// End of NV12_Load_8x4
--- a/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm
+++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: NV12_Load_8x5.asm
+//
+// Reads a 16x8 Y block and an interleaved UV block (16x5 per the comments
+// below) from the NV12 source surfaces, then converts Y, U and V bytes to
+// words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  NV12_LOAD_8x5
+#include "PL2_Load.inc"
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 8x5 planar U and V -----------------------------------------------------
+// Only the Y origin (rMSGSRC.1) is halved; the X origin is left unchanged.
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block Y origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (16x5)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: the <16;8,2> region takes 8 bytes at stride 2 from each of
+// two consecutive 16-byte groups; U starts at the even offset, V at +1.
+// Each iteration consumes 32 source bytes, and the loop runs
+// nUV_NUM_OF_ROWS/2 times, counting down to 0.
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<16;8,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<16;8,2>
+    }
+// End of NV12_Load_8x5
--- a/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm
+++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: NV12_Load_9x5.asm
+//
+// Reads a 16x8 Y block and an interleaved UV block (20x5 per the comments
+// below) from the NV12 source surfaces, then converts Y, U and V bytes to
+// words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  NV12_LOAD_9x5
+#include "PL2_Load.inc"
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+// Load 9x5 planar U and V -----------------------------------------------------
+// Only the Y origin (rMSGSRC.1) is halved; the X origin is left unchanged.
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block Y origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (20x5)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: U starts at the even byte offset, V at +1, both at stride 2.
+// The source advances 32 bytes per iteration and the loop runs
+// nUV_NUM_OF_ROWS-1 times, counting down to 0
+// (presumably one 32-byte-pitch chroma row per iteration - confirm).
+	$for(nUV_NUM_OF_ROWS-2; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<16;8,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<16;8,2>
+    }
+// End of NV12_Load_9x5
--- a/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: P208_Load_8x8.asm
+//
+// Reads a 16x8 Y block and a 16x8 interleaved UV block from the P208 source
+// surfaces, then converts Y, U and V bytes to words in uwDEST_Y / uwDEST_U /
+// uwDEST_V. Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  P208_LOAD_8x8
+#include "PL2_Load.inc"
+// The origin and block size are set once, outside the LOAD_UV_ONLY guard,
+// because the UV read reuses the same rMSGSRC contents unchanged.
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    mov  (1) rMSGSRC.2<1>:ud	nDPR_BLOCK_SIZE_Y:ud							// Y Block width and height (16x8) (U/V block size is the same)
+// Load 16x8 P208 Y ------------------------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+	// Load 16x8 planar UV -----------------------------------------------------
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (0; <nY_NUM_OF_ROWS; 1) {
+        mov	(16)	uwDEST_Y(0,%1*16)	ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: with a byte stride of 2, U is taken from the even byte
+// offsets and V from the odd ones (+1). Each iteration consumes 32 source
+// bytes, and the loop runs nUV_NUM_OF_ROWS/2 times.
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov	(16)	uwDEST_U(0,%1*16)	ubSRC_U(0,%1*32)<32;16,2>
+        mov	(16)	uwDEST_V(0,%1*16)	ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+// End of P208_Load_8x8.asm
--- a/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: P208_Load_9x8.asm
+//
+// Reads a 16x8 Y block and an interleaved UV block (20x8 per the block-size
+// comment below) from the P208 source surfaces, then converts Y, U and V
+// bytes to words in uwDEST_Y / uwDEST_U / uwDEST_V.
+// Defining LOAD_UV_ONLY skips the Y read and the Y conversion.
+//----------------------------------------------------------------
+#define  P208_LOAD_9x8
+#include "PL2_Load.inc"
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+// Load 16x8 P208 Y ------------------------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+	// Load 20x8 planar UV (same origin as Y, wider block) ----------------------
+    mov  (1) rMSGSRC.2<1>:ud	nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (20x8)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (0; <nY_NUM_OF_ROWS; 1) {
+        mov	(16)	uwDEST_Y(0,%1*16)	ubSRC_Y(0,%1*16)
+    }
+#endif
+// De-interleave: with a byte stride of 2, U is taken from the even byte
+// offsets and V from the odd ones (+1). The source advances 32 bytes per
+// iteration and the loop runs nUV_NUM_OF_ROWS times
+// (presumably one 32-byte-pitch chroma row per iteration - confirm).
+    $for (0; <nUV_NUM_OF_ROWS; 1) {
+        mov	(16)	uwDEST_U(0,%1*16)	ubSRC_U(0,%1*32)<32;16,2>
+        mov	(16)	uwDEST_V(0,%1*16)	ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+// End of P208_Load_9x8.asm
--- a/i965_drv_video/shaders/post_processing/Common/PA_Load.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PA_Load.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PA_Load.inc
+//
+// Definitions shared by the packed (PA) load modules: row counts, data port
+// block/response sizes selected by PA_LOAD_8x8 or PA_LOAD_9x8, and the
+// source/destination register aliases.
+//
+// YUV422 data are first loaded to bottom I/O REGION_2, then unpacked to planar data
+// and stored in top I/O REGION_1
+#undef 	nY_NUM_OF_ROWS
+#undef 	nUV_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS      8       // Number of Y rows per block
+#define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+// 8x8 variant: a single 32x8 read
+#if defined(PA_LOAD_8x8)
+        #define nDPR_BLOCK_SIZE_YUV           nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // Y block size 32x8
+        #define nDPR_MSG_SIZE_YUV             nRESLEN_8                         // # of MRF's to hold Y block data (8)
+#endif
+// 9x8 variant: the main 32x8 read plus an additional 4x8 read
+#if defined(PA_LOAD_9x8)
+        #define nDPR_BLOCK_SIZE_YUV_MAIN      nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // Main YUV block size 32x8
+        #define nDPR_MSG_SIZE_YUV_MAIN        nRESLEN_8                         // # of MRF's to hold Y block data (8)
+        #define nDPR_BLOCK_SIZE_YUV_ADDITION  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_8    // Additional YUV block size 4x8
+        #define nDPR_MSG_SIZE_YUV_ADDITION    nRESLEN_1                         // # of MRF's to hold additional block data (1, per nRESLEN_1)
+#endif
+// Packed source aliases (bottom region)
+#define udSRC_YUV               udBOT_Y_IO
+#define nSRC_YUV_REG            nBOT_Y
+// Planar destination aliases (top region)
+#define uwDEST_Y                uwTOP_Y
+#define uwDEST_U                uwTOP_U
+#define uwDEST_V                uwTOP_V
+#define nSRC_REGION nREGION_1    // REGION_1 will be the source region for first kernel
+// End of PA_Load.inc
--- a/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PA_Load_8x8.asm
+//----------------------------------------------------------------
+//  Reads one packed YUV422 block with a single data port read and unpacks it
+//  into separate word-aligned Y, U and V destination planes.
+//----------------------------------------------------------------
+#define  PA_LOAD_8x8
+#include "PA_Load.inc"
+//  Load 16x8 packed data block
+//  The packed block is requested from the data port as a 32x8 block
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Block origin
+    // NOTE(review): the shift reads acc0, not rMSGSRC.0 - presumably acc0 still holds
+    // the X sum produced by the add directly above; keep these two instructions adjacent.
+    shl  (1) rMSGSRC.0<1>:d     acc0:w            1:w                              // H. block origin need to be doubled
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV:ud                             // Block width and height (32x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud                                  // Copy the prepared header into the message register
+    send (8) udSRC_YUV(0)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
+//  Unpack to "planar" YUV422 format in word-aligned bytes
+//  The three address registers are set to (component offset within the packed data +
+//  byte address of the source register) and are then used for indirect reads below.
+    add  (4) pCF_Y_OFFSET<1>:uw    ubSRC_CF_OFFSET<4;4,1>:ub    nSRC_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+    // One iteration per row: row %1 is read at %1*nGRFWIB from the indirect base.
+    // REGION(16,2) / REGION(8,4) presumably mean (element count, byte stride) - confirm in the macro definition.
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16)  uwDEST_Y(0, %1*16)<1>     r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2)
+        mov (8)   uwDEST_U(0, %1*8)<1>      r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4)
+        mov (8)   uwDEST_V(0, %1*8)<1>      r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4)
+    }
+// End of PA_Load_8x8
--- a/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PA_Load_9x8.asm
+//----------------------------------------------------------------
+//  This module loads packed YUV422 data and unpacks it into 16x8 Y, 9x8 U and
+//  9x8 V blocks for the CSC module, stored in word-aligned format.
+//  The 9th U/V column comes from a second, 4-wide read placed right after the
+//  main 32-wide block.
+//----------------------------------------------------------------
+#define  PA_LOAD_9x8
+#include "PA_Load.inc"
+//  Load 18x8 packed data block
+//  The packed data is fetched as 36x8: a 32x8 main read plus a 4x8 additional read
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Block origin
+    // NOTE(review): the shift reads acc0, not rMSGSRC.0 - presumably acc0 still holds
+    // the X sum produced by the add directly above; keep these two instructions adjacent.
+    shl  (1) rMSGSRC.0<1>:d     acc0:w            1:w                              // H. block origin need to be doubled
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV_MAIN:ud                        // Block width and height (32x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_YUV(0)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_MAIN+nBI_CURRENT_SRC_YUV:ud
+    // Second read: move the X origin 32 to the right of the main block origin.
+    // (Original note: "the last 4 pixels are read again for optimization".)
+    add  (1) rMSGSRC.0<1>:d     rMSGSRC.0:d       32:w                             //the last 4 pixels are read again for optimization
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV_ADDITION:ud                    // Block width and height (4x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    // The response lands in register 8 of the source region, directly behind the 8 registers of the main block
+    send (8) udSRC_YUV(8)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_ADDITION+nBI_CURRENT_SRC_YUV:ud
+//  Unpack to "planar" YUV422 format in word-aligned bytes
+//  The address registers are set to (component offset within the packed data +
+//  byte address of the source register) and are then used for indirect reads below.
+    add  (4) pCF_Y_OFFSET<1>:uw    ubSRC_CF_OFFSET<4;4,1>:ub    nSRC_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+    // Main block: 16 Y and the first 8 U/V samples of each row.
+    // U/V rows are spaced 16 words apart here, which leaves room for the 9th sample at index 8.
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16)  uwDEST_Y(0, %1*16)<1>     r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2)
+        mov (8)   uwDEST_U(0, %1*16)<1>     r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4)
+        mov (8)   uwDEST_V(0, %1*16)<1>     r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4)
+    }
+    // Additional block: one extra U and V sample per row, stored at index 8 of the row.
+    // Offset 256 skips the main block (presumably 8 registers x 32 bytes - confirm nGRFWIB);
+    // each row of the additional block is 4 bytes wide, hence %1*4.
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (1)   uwDEST_U(0, %1*16+8)<1>   r[pCF_U_OFFSET, %1*4+256]REGION(1,0)
+        mov (1)   uwDEST_V(0, %1*16+8)<1>   r[pCF_V_OFFSET, %1*4+256]REGION(1,0)
+    }
+	//UV expansion done in PL9x8_PL16x8.asm module
+// End of PA_Load_9x8
--- a/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL16x8_PL8x4.asm
+//----------------------------------------------------------------
+// Decimates the U and V planes in place and then lowers nUV_NUM_OF_ROWS to 4
+// for the modules that follow.
+//----------------------------------------------------------------
+#include "common.inc"
+// Pick the working planes from nSRC_REGION unless the includer already defined DEST_U
+#ifndef DEST_U 	//DEST_U, DEST_V not defined
+	#if (nSRC_REGION==nREGION_1)
+		#define DEST_Y		uwTOP_Y
+		#define DEST_U		uwTOP_U
+		#define DEST_V		uwTOP_V
+	#elif (nSRC_REGION==nREGION_2)
+		#define DEST_Y		uwBOT_Y
+		#define DEST_U		uwBOT_U
+		#define DEST_V		uwBOT_V
+	#endif
+#endif
+//Convert 444 from sampler to 8x4 chroma
+// %1 walks source registers 0,2,4,6 and %2 walks output slots 0..3: from each even
+// register, 8 elements at stride 2 are packed into 8 consecutive words at %2*8.
+// NOTE(review): odd registers are never read, so this halves the data in both
+// directions (the original comment said "to 422") - confirm the intended format.
+$for (0, 0; <8; 2, 1) {
+	mov	(8)	DEST_U(0,%2*8)<1>	DEST_U(%1)<16;8,2>
+	mov	(8)	DEST_V(0,%2*8)<1>	DEST_V(%1)<16;8,2>	
+}
+// Re-define new number of lines
+#undef nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS     4
--- a/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL16x8_PL8x8.asm
+//----------------------------------------------------------------
+// Decimates the U and V planes horizontally in place (every other element kept).
+//----------------------------------------------------------------
+#include "common.inc"
+// Pick the working planes from nSRC_REGION unless the includer already defined DEST_U
+#ifndef DEST_U
+	//DEST_U, DEST_V not defined
+	#if (nSRC_REGION==nREGION_1)
+		#define DEST_Y		uwTOP_Y
+		#define DEST_U		uwTOP_U
+		#define DEST_V		uwTOP_V
+	#elif (nSRC_REGION==nREGION_2)
+		#define DEST_Y		uwBOT_Y
+		#define DEST_U		uwBOT_U
+		#define DEST_V		uwBOT_V
+	#endif
+#endif
+//Convert 444 from sampler to 422
+// %1 walks source registers 0,2,4,6 and %2 walks destination registers 0..3.
+// The source region <16;8,2> takes 8 elements at stride 2, then steps 16 elements
+// to continue in the following register.
+// NOTE(review): these mov instructions carry no explicit execution size; they
+// presumably rely on a default of 16 set elsewhere, so that each one packs two
+// source registers into one destination register - confirm.
+$for (0, 0; <8; 2, 1) {
+	mov		DEST_U(%2)<1>	DEST_U(%1)<16;8,2>
+	mov		DEST_V(%2)<1>	DEST_V(%1)<16;8,2>	
+}
--- a/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL2_Load.inc
+//
+// Shared definitions for the load modules that read a Y block plus one U/V block.
+// The includer selects the U/V geometry by defining exactly one of the
+// *_LOAD_* symbols tested below before including this file.
+// Discard row counts left by any previously included module before redefining them
+#undef 	nY_NUM_OF_ROWS
+#undef 	nUV_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS      8                                         // Number of Y rows per block
+#define nDPR_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8           // Y block size 16x8
+#define nDPR_MSG_SIZE_Y     nRESLEN_4                                 // # of response registers for the Y block (4)
+#if defined(NV11_LOAD_4x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8    // U/V block size 8x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_2                         // # of response registers for the U/V block (2)
+#endif
+#if defined(NV11_LOAD_5x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_12+nBLOCK_HEIGHT_8   // U/V block size 12x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_4                         // # of response registers for the U/V block (4)
+#endif
+#if defined(NV12_LOAD_8x4)
+        #define nUV_NUM_OF_ROWS     4                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // U/V block size 16x4
+        #define nDPR_MSG_SIZE_UV    nRESLEN_2                         // # of response registers for the U/V block (2)
+#endif
+#if defined(NV12_LOAD_8x5)
+        #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_5   // U/V block size 16x5
+        #define nDPR_MSG_SIZE_UV    nRESLEN_3                         // # of response registers for the U/V block (3)
+#endif
+#if defined(NV12_LOAD_9x5)
+        #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_20+nBLOCK_HEIGHT_5   // U/V block size 20x5
+        #define nDPR_MSG_SIZE_UV    nRESLEN_5                         // # of response registers for the U/V block (5)
+#endif
+#if defined(P208_LOAD_8x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // U/V block size 16x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_4                         // # of response registers for the U/V block (4)
+#endif
+#if defined(P208_LOAD_9x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_20+nBLOCK_HEIGHT_8   // U/V block size 20x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_8                         // # of response registers for the U/V block (8)
+#endif
+// Source/destination region definitions
+// The includer may pre-define udSRC_Y / udSRC_U to redirect the raw load target
+#if !defined(udSRC_Y)
+        #define udSRC_Y  udBOT_Y_IO     // Default Y source region is bottom Y region
+#endif
+#if !defined(udSRC_U)
+        #define udSRC_U  udBOT_U_IO     // Default U source region is bottom U region
+#endif
+#define ubSRC_Y           ubBOT_Y       // Loaded data are read back as bytes from the bottom region
+#define nSRC_Y_REG        nBOT_Y
+#define ubSRC_U           ubBOT_U
+#define nSRC_U_REG        nBOT_U
+#define uwDEST_Y          uwTOP_Y       // However they can be transferred to word-aligned byte if desired
+#define uwDEST_U          uwTOP_U
+#define uwDEST_V          uwTOP_V
+#define nSRC_REGION       nREGION_1     // REGION_1 will be the source region for first kernel
+// End of PL2_Load.inc
--- a/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL3_Load.inc
+//
+// Shared definitions for the load modules that read separate Y, U and V blocks.
+// The includer selects the U/V geometry by defining one of the IMC3_LOAD_*
+// symbols tested below before including this file.
+// Discard row counts left by any previously included module before redefining them
+#undef 	nY_NUM_OF_ROWS
+#undef 	nUV_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS      8                                     // Number of Y rows per block
+#define nDPR_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8       // Y block size 16x8
+#define nDPR_MSG_SIZE_Y     nRESLEN_4                             // # of response registers for the Y block (4)
+#if defined(IMC3_LOAD_8x4)
+    #define nUV_NUM_OF_ROWS     4                                 // Number of U/V rows per block
+    #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4    // U/V block size 8x4
+    #define nDPR_MSG_SIZE_UV    nRESLEN_1                         // # of response registers for the U/V block (1)
+#endif
+#if defined(IMC3_LOAD_8x5)
+    #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+    #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_5    // U/V block size 8x5
+    #define nDPR_MSG_SIZE_UV    nRESLEN_2                         // # of response registers for the U/V block (2)
+#endif
+#if defined(IMC3_LOAD_9x5)
+    #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+    #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_12+nBLOCK_HEIGHT_5   // U/V block size 12x5
+    #define nDPR_MSG_SIZE_UV    nRESLEN_3                         // # of response registers for the U/V block (3)
+#endif
+// Source/destination region definitions
+// The includer may pre-define udSRC_Y / udSRC_U / udSRC_V to redirect the raw load target
+#if !defined(udSRC_Y)
+    #define udSRC_Y  udBOT_Y_IO     // Default Y source region is bottom Y region
+#endif
+#if !defined(udSRC_U)
+    #define udSRC_U  udBOT_U_IO     // Default U source region is bottom U region
+#endif
+#if !defined(udSRC_V)
+    #define udSRC_V  udBOT_V_IO     // Default V source region is bottom V region
+#endif
+#define ubSRC_Y      ubBOT_Y        // Loading data are always in byte type
+#define ubSRC_U      ubBOT_U
+#define ubSRC_V      ubBOT_V
+#define uwDEST_Y     uwTOP_Y        // However they can be transferred to word-aligned byte if desired
+#define uwDEST_U     uwTOP_U
+#define uwDEST_V     uwTOP_V
+#define nSRC_REGION  nREGION_1      // REGION_1 will be the source region for first kernel
+// End of PL3_Load.inc
--- a/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL4x8_Save_NV11.asm
+//
+// Y part: packs the word-aligned Y plane to bytes and writes it as one 16x8 block.
+// Skipped entirely when SAVE_UV_ONLY is defined.
+#include "PL4x8_Save_NV11.inc"
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud       // Start from the common header template
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+// Origin and block size are patched directly in the message header; rMSGSRC itself is not modified here
+    mov  (2) mMSGHDR.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) mMSGHDR.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+// The block below is disabled reference code for mask-based merging; it is kept as found.
+///* Yoni - masking is not relevant for ILK?!? 
+//
+//        //Use the mask to determine which pixels shouldn't be over-written
+//        cmp.ge.f0.0     (1)             NULLREG         BLOCK_MASK_D:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+//        (f0.0)  jmpi WritePlanarToDataPort
+//
+//        //If mask is not all 1's, then load the entire 16x8 block
+//        //so that only those bytes may be modified that need to be (using the mask)
+//    send (8)    SRC_YD(0)<1>    MSGHDR  MSGSRC<8;8,1>:ud        DWBRMSGDSC+0x00040000+BI_DEST_Y:ud         //16x8
+//        
+//    asr  (2)    MSGSRC.0<1>:ud  ORIX<2;2,1>:w   1:w     // U/V block origin should be half of Y's
+//    mov  (1)    MSGSRC.2<1>:ud  0x00030007:ud           // Block width and height (8x4)
+//    send (8)    SRC_UD(0)<1>    MSGHDR  MSGSRC<8;8,1>:ud        DWBRMSGDSC+0x00010000+BI_DEST_U:ud
+//    send (8)    SRC_VD(0)<1>    MSGHDR  MSGSRC<8;8,1>:ud        DWBRMSGDSC+0x00010000+BI_DEST_V:ud
+//        
+//    //Restore the origin information
+//    mov (2)     MSGSRC.0<1>:ud  ORIX<2;2,1>:w           // Block origin
+//    mov (1)     MSGSRC.2<1>:ud  0x0007000F:ud           // Block width and height (16x8)
+//
+//        //expand U and V to be aligned on word boundary
+//        mov     (16)    SRC_UW(1)<1>            SRC_U(0,16)
+//        mov     (16)    SRC_UW(0)<1>            SRC_U(0, 0)
+//        mov (16)        SRC_VW(1)<1>            SRC_V(0,16)
+//        mov (16)        SRC_VW(0)<1>            SRC_V(0, 0)
+//        
+//        //Merge the data
+//        mov  (1)        f0.1:uw                 BLOCK_MASK_V:uw                 //Load the mask on flag reg
+//        (f0.1)  mov     (8)     TEMP0<1>:uw     BLOCK_MASK_H:uw
+//        (-f0.1) mov     (8)     TEMP0<1>:uw     0:uw
+//                
+//        // Destination is Word aligned
+//                $for(0; <Y_ROW_SIZE; 2) {
+//                        mov     (1)     f0.1:uw         TEMP(0,%1)<0;1,0>
+//                        (-f0.1) mov  (16)       DEST_Y(0, %1*32)<2>             SRC_Y(0, %1*16)
+//                        (-f0.1) mov  (16)       DEST_U(0, %1*8)<1>              SRC_U(0, %1*8)  //only works for Word aligned Byte data
+//                        (-f0.1) mov  (16)       DEST_V(0, %1*8)<1>              SRC_V(0, %1*8)  //only works for Word aligned Byte data
+//
+//                        mov     (1)     f0.1:uw         TEMP(0,1+%1)<0;1,0>
+//                        (-f0.1) mov  (16)       DEST_Y(0, 1+%1*32)<2>   SRC_Y(0, 1+%1*16)
+//
+//                }
+//
+//*/ Yoni - masking is not relevant for ILK?!? 
+// Only the disabled code above jumps to this label; live code simply falls through.
+WritePlanarToDataPort:
+    // %1 walks Y rows 0,2,4,6 and %2 walks payload registers 0..3:
+    // two rows of 16 bytes are packed into each payload register.
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+            mov (16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)
+            mov (16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)
+    } 
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+#endif
+// Save U/V data block in planar format (4x8) ----------------------------------
+// U and V are written interleaved (U to even payload bytes, V to odd payload
+// bytes), so the 4x8 U and 4x8 V samples form one 8-wide, 8-row block.
+    mov (2)  rMSGSRC.0<1>:d    wORIX<2;2,1>:w             // Block origin
+    asr (1)  rMSGSRC.0<1>:d    rMSGSRC.0<0;1,0>:d    1:w  // U/V block X origin should be half of Y's (Y origin is unchanged)
+    // Program the U/V block size explicitly. The Y path above only patches
+    // mMSGHDR.2, so without this rMSGSRC.2 would still carry whatever block
+    // size an earlier module left behind, and the header copied below would
+    // not match the 2-register payload sent with nDPW_MSG_SIZE_UV.
+    mov (1)  rMSGSRC.2<1>:ud   nDPW_BLOCK_SIZE_UV:ud      // U/V interleaved block width and height (8x8)
+    mov (8)  mMSGHDR<1>:ud     rMSGSRC<8;8,1>:ud
+    // Two iterations (%1 = 0,4; %2 = 0,1): each fills one payload register with
+    // 16 U bytes and 16 V bytes taken from source register %2 of each plane.
+    $for(0,0; <nY_NUM_OF_ROWS;4,1) {
+        mov (16) mubMSGPAYLOAD(%2,0)<2>     ub2DEST_U(%2)REGION(16,2) 
+        mov (16) mubMSGPAYLOAD(%2,1)<2>     ub2DEST_V(%2)REGION(16,2) 
+    }
+    send (8)    dNULLREG    mMSGHDR    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
+// End of PL4x8_Save_NV11
--- a/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//Module name: PL4x8_Save_NV11.inc
+//
+// Setup for storing planar data
+//
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+#define nDPW_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // Y block size 16x8
+#define nDPW_MSG_SIZE_Y     nMSGLEN_4                         // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8    // U/V interleaved block width and height (8x8)
+#define nDPW_MSG_SIZE_UV    nMSGLEN_2                         // # of MRF's to hold U/V block data (2)
+// The data to save (ub2DEST_*) is taken from the region named by nSRC_REGION;
+// the SRC_* names point at the other region.
+#if (nSRC_REGION==nREGION_1)
+        #define udSRC_Y                 udBOT_Y_IO
+        #define udSRC_U                 udBOT_U_IO
+        #define udSRC_V                 udBOT_V_IO
+        #define ubSRC_Y                 ubBOT_Y
+        #define ubSRC_U                 ubBOT_U
+        #define ubSRC_V                 ubBOT_V
+        #define uwSRC_U                 uwBOT_U  //For masking operation
+        #define uwSRC_V                 uwBOT_V
+        #define ub2DEST_Y               ub2TOP_Y
+        #define ub2DEST_U               ub2TOP_U
+        #define ub2DEST_V               ub2TOP_V
+#elif (nSRC_REGION==nREGION_2)
+        #define udSRC_Y                 udTOP_Y_IO
+        #define udSRC_U                 udTOP_U_IO
+        #define udSRC_V                 udTOP_V_IO
+        #define ubSRC_Y                 ubTOP_Y
+        #define ubSRC_U                 ubTOP_U
+        #define ubSRC_V                 ubTOP_V
+        #define uwSRC_U                 uwTOP_U  //For masking operation
+        #define uwSRC_V                 uwTOP_V
+        #define ub2DEST_Y               ub2BOT_Y
+        #define ub2DEST_U               ub2BOT_U
+        #define ub2DEST_V               ub2BOT_V
+#endif
+// Disabled declarations for the (disabled) masking path, kept as found
+///* Yoni - masking is not relevant for ILK?!? 
+//#define         TEMP0   REG(r,54)
+//.declare    TEMP        Base=TEMP0      ElementSize=2   SrcRegion=<8;8,1>       Type=uw
+///* Yoni - masking is not relevant for ILK?!? 
--- a/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL5x8_PL16x8.asm
+//
+// Widens the U and V planes in place with two interpolation passes.
+#include "Expansion.inc"
+//------------------------------ Horizontal Upconversion -----------------------------
+// Every avg.sat pairs two views of the same source:
+//   <1;2,0> repeats each element twice      (a,a,b,b,c,c,...)
+//   <1;2,1> pairs each element with its right neighbour (a,b,b,c,c,d,...)
+// so the result is a, avg(a,b), b, avg(b,c), ... - a 2x horizontal interpolation.
+// Both loops count down and write the upper half (+16) before the lower half;
+// presumably this keeps the in-place expansion from overwriting source data
+// that has not been read yet - do not reorder without confirming.
+//
+// Pass 1: source groups at a 16-word pitch are expanded to a 32-word pitch
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        avg.sat (16) uwDEST_U(0, %1*32+16)    uwDEST_U(0, %1*16+7)<1;2,0>    uwDEST_U(0, %1*16+7)<1;2,1>
+        avg.sat (16) uwDEST_V(0, %1*32+16)    uwDEST_V(0, %1*16+7)<1;2,0>    uwDEST_V(0, %1*16+7)<1;2,1>
+        avg.sat (16) uwDEST_U(0, %1*32)    uwDEST_U(0, %1*16)<1;2,0>    uwDEST_U(0, %1*16)<1;2,1>
+        avg.sat (16) uwDEST_V(0, %1*32)    uwDEST_V(0, %1*16)<1;2,0>    uwDEST_V(0, %1*16)<1;2,1>
+    }
+// Pass 2: the same interpolation applied again within each 32-word group
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        avg.sat (16) uwDEST_U(0, %1*32+16) uwDEST_U(0, %1*32+18)<1;2,0> uwDEST_U(0, %1*32+18)<1;2,1>
+        avg.sat (16) uwDEST_V(0, %1*32+16) uwDEST_V(0, %1*32+18)<1;2,0> uwDEST_V(0, %1*32+18)<1;2,1>
+        avg.sat (16) uwDEST_U(0, %1*32) uwDEST_U(0, %1*32)<1;2,0> uwDEST_U(0, %1*32)<1;2,1>
+        avg.sat (16) uwDEST_V(0, %1*32) uwDEST_V(0, %1*32)<1;2,0> uwDEST_V(0, %1*32)<1;2,1>
+    }
+// End of PL5x8_PL16x8
--- a/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL5x8_PL8x8.asm
+//
+// Widens each U and V row in place with one interpolation pass.
+#include "Expansion.inc"
+//------------------------------ Horizontal Upconversion -----------------------------
+// <1;2,0> repeats each source element twice (a,a,b,b,...) and <1;2,1> pairs each
+// element with its right neighbour (a,b,b,c,...), so each row becomes
+// a, avg(a,b), b, avg(b,c), ... : 8 outputs computed from 5 inputs.
+// Source and destination start at the same offset (%1*8) in every row.
+    $for (0; <nUV_NUM_OF_ROWS; 1) {
+        avg.sat (8) uwDEST_U(0, %1*8)    uwDEST_U(0, %1*8)<1;2,0>    uwDEST_U(0, %1*8)<1;2,1>
+        avg.sat (8) uwDEST_V(0, %1*8)    uwDEST_V(0, %1*8)<1;2,0>    uwDEST_V(0, %1*8)<1;2,1>
+    }
+// End of PL5x8_PL8x8
--- a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x4_Save_IMC3.asm
+//
+// Save planar YUV420 frame data block of size 16x8
+//
+// This first section handles partially masked blocks: when the block mask does
+// not cover every pixel, the existing destination Y/U/V data is read back and
+// copied over the pixels whose mask bit is clear, so that the writes that
+// follow leave those pixels unchanged.
+#include "PL8x4_Save_IMC3.inc"
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WritePlanarToDataPort
+    //If mask is not all 1's, then load the entire 16x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+    // Load 16x8 planar Y ----------------------------------------------------------
+    mov  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+    // Load 8x4 planar U and V -----------------------------------------------------
+    asr  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    1:w   // U/V block origin should be half of Y's
+    mov  (1) rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_UV:ud   // Block width and height (8x4)
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_U:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud       // Same header again for the V read
+    send (8) udSRC_V(0)<1>      mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_V:ud
+    //expand U and V to be aligned on word boundary - Y remains in bytes
+    // Done in place from the highest index down (byte source and word destination
+    // share the same registers), so keep the descending order.
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwSRC_U(0, %1*16)<1>    ubSRC_U(0, %1*16)
+        mov (16)  uwSRC_V(0, %1*16)<1>    ubSRC_V(0, %1*16)
+    }
+    //Merge the data
+    // Build one mask word per row in rMASK_TEMP: the horizontal mask where the
+    // vertical mask bit is set, 0 elsewhere.
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+    // Destination is Word aligned
+    // Two Y rows and one U/V row are merged per iteration. (-f0.1) copies the
+    // loaded data only into channels whose mask bit is clear.
+    $for(0; <nY_NUM_OF_ROWS; 2) {
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>
+        (-f0.1) mov (16)    ub2DEST_Y(0, %1*32)<2>    ubSRC_Y(0, %1*16)
+        (-f0.1) mov (16)    ub2DEST_U(0, %1*8)<1>     ubSRC_U(0, %1*8)    //only works for Word aligned Byte data
+        (-f0.1) mov (16)    ub2DEST_V(0, %1*8)<1>     ubSRC_V(0, %1*8)    //only works for Word aligned Byte data
+        // Second Y row of the pair: mask word %1+1 guards row %1+1, so both offsets
+        // advance by one whole row (32 bytes in the word-aligned destination,
+        // 16 bytes in the byte source). The former "1+%1*32" / "1+%1*16" pointed
+        // one byte into row %1 instead of at the start of row %1+1.
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,1+%1)<0;1,0>
+        (-f0.1) mov (16)    ub2DEST_Y(0, %1*32+32)<2>  ubSRC_Y(0, %1*16+16)
+    }
+// Write section: packs the word-aligned planes to bytes and writes Y (unless
+// SAVE_UV_ONLY is defined), then U, then V, each with its own data port write.
+WritePlanarToDataPort:
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+    mov (2)     rMSGSRC.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov (1)     rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+    mov (8)     mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    // %1 walks Y rows 0,2,4,6 and %2 walks payload registers 0..3:
+    // two rows of 16 bytes are packed into each payload register.
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+        mov(16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)
+        mov(16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+#endif
+// Save U/V data block in planar format (8x4) ----------------------------------
+// The same origin and block size in rMSGSRC serve both the U and the V write
+    asr  (2)    rMSGSRC.0<1>:d     wORIX<2;2,1>:w    1:w   // U/V block origin should be half of Y's
+    mov  (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_UV:ud   // Block width and height (8x4)
+    mov  (8)    mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+// Save U picture data ---------------------------------------------------------
+    mov (16)    mubMSGPAYLOAD(0,0)<1>      ub2DEST_U(0)REGION(16,2)   // U rows 0,1
+    mov (16)    mubMSGPAYLOAD(0,16)<1>     ub2DEST_U(1)REGION(16,2)   // U rows 2,3
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_U:ud
+    mov  (8)    mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud       // Rebuild the header for the V write
+// Save V picture data ---------------------------------------------------------
+    mov  (16)   mubMSGPAYLOAD(0,0)<1>      ub2DEST_V(0)REGION(16,2)   // V rows 0,1
+    mov  (16)   mubMSGPAYLOAD(0,16)<1>     ub2DEST_V(1)REGION(16,2)   // V rows 2,3
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_V:ud
+// End of PL8x4_Save_IMC3
--- a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x4_Save_IMC3.inc
+//
+// Setup for storing planar data
+//
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+// For saving
+#define nDPW_BLOCK_SIZE_Y        nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8    // Y block size 16x8
+#define nDPW_MSG_SIZE_Y          nMSGLEN_4                          // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV       nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4     // U/V block size 8x4
+#define nDPW_MSG_SIZE_UV         nMSGLEN_1                          // # of MRF's to hold U/V block data (1)
+// For masking
+// Read-back sizes used when the existing destination block has to be merged;
+// any values left by a load module are replaced.
+#undef  nDPR_MSG_SIZE_Y
+#define nDPR_MSG_SIZE_Y      nRESLEN_4        // # of response registers for the Y block (4)
+#undef  nDPR_MSG_SIZE_UV
+#define nDPR_MSG_SIZE_UV     nRESLEN_1        // # of response registers for the U/V block (1)
+// Scratch register holding one mask word per row during the merge
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+// The data to save (ub2DEST_*) is taken from the region named by nSRC_REGION;
+// the read-back data used for masking (SRC_*) goes to the other region.
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define ub2DEST_Y        ub2TOP_Y
+    #define ub2DEST_U        ub2TOP_U
+    #define ub2DEST_V        ub2TOP_V
+    //For masking operation
+    #define udSRC_Y          udBOT_Y_IO
+    #define udSRC_U          udBOT_U_IO
+    #define udSRC_V          udBOT_V_IO
+    #define ubSRC_Y          ubBOT_Y
+    #define ubSRC_U          ubBOT_U
+    #define ubSRC_V          ubBOT_V
+    #define uwSRC_U          uwBOT_U
+    #define uwSRC_V          uwBOT_V
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define ub2DEST_Y        ub2BOT_Y
+    #define ub2DEST_U        ub2BOT_U
+    #define ub2DEST_V        ub2BOT_V
+    //For masking operation
+    #define udSRC_Y          udTOP_Y_IO
+    #define udSRC_U          udTOP_U_IO
+    #define udSRC_V          udTOP_V_IO
+    #define ubSRC_Y          ubTOP_Y
+    #define ubSRC_U          ubTOP_U
+    #define ubSRC_V          ubTOP_V
+    #define uwSRC_U          uwTOP_U
+    #define uwSRC_V          uwTOP_V
+#endif
--- a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x4_Save_NV12.asm
+//
+// Save the current planar block through the data port: a 16x8 Y block (write skipped when SAVE_UV_ONLY is defined) and a 16x4 interleaved U/V block. When the block mask is not full, the destination blocks are read back first and merged under the mask.
+//---------------------------------------------------------------
+//  Symbols needed to be defined before including this module
+//
+//      DWORD_ALIGNED_DEST:     only if DEST_Y, DEST_U, DEST_V data are DWord aligned
+//      ORIX:
+//---------------------------------------------------------------
+#include "PL8x4_Save_NV12.inc"
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud       // Start the message header as a copy of rMSGSRC
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+    mov  (2) mMSGHDR.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) mMSGHDR.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+#endif
+//Use the mask to determine which pixels shouldn't be over-written
+	and	(1)		acc0<1>:ud		udBLOCK_MASK<0;1,0>:ud		0x00FFFFFF:ud	//Keep only the low 24 bits of the block mask
+	cmp.ge.f0.0	(1)		dNULLREG		acc0<0;1,0>:ud		0x00FFFFFF:ud	//Check if all pixels in the block need to be modified
+	(f0.0)	jmpi WritePlanarToDataPort	//All 24 mask bits set: skip the read-back and merge
+//If mask is not all 1's, then load the entire 16x8 block
+//so that only those bytes may be modified that need to be (using the mask). NOTE(review): this read-back also runs when SAVE_UV_ONLY is defined - confirm that is intended
+  send (8)	udSRC_Y(0)<1>	mMSGHDR	udDUMMY_NULL nDATAPORT_READ	nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud		//Read back the destination Y block (16x8)
+  asr  (1)	rMSGSRC.1<1>:ud	wORIY<0;1,0>:w	1:w	{ NoDDClr }	// U/V block origin should be half of Y's
+  mov  (1)	rMSGSRC.2<1>:ud	nDPW_BLOCK_SIZE_UV:ud		{ NoDDChk }	// Block width and height (16x4)
+  mov (8)  mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud //copy the updated rMSGSRC into the message header
+  send (8)	udSRC_U(0)<1>	mMSGHDR	udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_UV:ud	//Read back the destination U/V block (16x4)
+//Restore the origin information
+  mov (2)	rMSGSRC.0<1>:ud	wORIX<2;2,1>:w		// Block origin
+  mov (1)	rMSGSRC.2<1>:ud	nDPW_BLOCK_SIZE_Y:ud		// Block width and height (16x8)
+  mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud //copy the restored rMSGSRC into the message header
+//Merge the data
+	mov  (1)	f0.1:uw			ubBLOCK_MASK_V:ub			//Load the mask on flag reg
+	(f0.1)	mov	(8)	rMASK_TEMP<1>:uw	uwBLOCK_MASK_H:uw	//Channels whose vertical-mask bit is set receive the horizontal mask
+	(-f0.1)	mov	(8)	rMASK_TEMP<1>:uw	0:uw	//Channels whose vertical-mask bit is clear receive 0
+//convert the mask from 16bits to 8bits by selecting every other bit
+	mov (1) udMASK_TEMP1(0,0)<1> 0x00040001:ud	//words 0x0001, 0x0004: test bits 0 and 2
+	mov (1) udMASK_TEMP1(0,1)<1> 0x00400010:ud	//words 0x0010, 0x0040: test bits 4 and 6
+	mov (1) udMASK_TEMP1(0,2)<1> 0x04000100:ud	//words 0x0100, 0x0400: test bits 8 and 10
+	mov (1) udMASK_TEMP1(0,3)<1> 0x40001000:ud	//words 0x1000, 0x4000: test bits 12 and 14
+//merge the loaded block with the current block
+  $for(0,0; <nY_NUM_OF_ROWS; 2,1) {	//%1 = Y row index (step 2), %2 = U/V row index (step 1)
+  	mov	(1)	f0.1:uw		uwMASK_TEMP(0, %1)<0;1,0>	//Flag = mask word of Y row %1
+		(-f0.1)	mov  (16)	ubDEST_Y(0,%1*32)<2>		ubSRC_Y(0,%1*16)		//Mask bit clear: take the read-back Y value
+	  and.nz.f0.1 (8) wNULLREG uwMASK_TEMP(0,%1)<0;1,0> uwMASK_TEMP1(0,0) //change the mask by selecting every other bit
+		(-f0.1)	mov  (8)	ubDEST_U(0, %2*16)<2>		ub2SRC_U(0, %1*8)<16;8,2>	//Mask bit clear: take read-back element at even offset (presumably U of the interleaved pair)
+		(-f0.1)	mov  (8)	ubDEST_V(0, %2*16)<2>		ub2SRC_U(0, %1*8+1)<16;8,2>	//Mask bit clear: take read-back element at odd offset (presumably V of the interleaved pair)
+		mov	(1)	f0.1:uw		uwMASK_TEMP(0,1+%1)<0;1,0>	//Flag = mask word of Y row %1+1
+		(-f0.1)	mov  (16)	ubDEST_Y(0, (1+%1)*32)<2>	ubSRC_Y(0, (1+%1)*16)		//Mask bit clear: take the read-back Y value
+  }	 
+WritePlanarToDataPort:
+#if !defined(SAVE_UV_ONLY)
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+            mov (16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)      // Y row %1 into the first 16 payload bytes
+            mov (16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)    // Y row %1+1 into the next 16 payload bytes
+    } 
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud    // Write the Y block
+#endif
+//** Save  8x4 packed U and V -----------------------------------------------------
+// We could write wORIX directly to mMSGHDR and then execute asr on it; that way we would
+// avoid using rMSGSRC as a buffer and save one instruction, but it is unknown whether
+// asr can operate on mMSGHDR, so rMSGSRC is used.
+    mov (2)  rMSGSRC.0<1>:d    wORIX<2;2,1>:w             // Block origin
+    asr (1)  rMSGSRC.1<1>:d    rMSGSRC.1<0;1,0>:d    1:w  // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud   nDPW_BLOCK_SIZE_UV:ud      // U/V block width and height (16x4)
+    mov (8)  mMSGHDR<1>:ud     rMSGSRC<8;8,1>:ud          // Header for the U/V write
+    $for(0,0; <nY_NUM_OF_ROWS;4,1) {
+        mov (16) mubMSGPAYLOAD(%2,0)<2>     ub2DEST_U(%2)REGION(16,2)    // U into even payload bytes
+        mov (16) mubMSGPAYLOAD(%2,1)<2>     ub2DEST_V(%2)REGION(16,2)    // V into odd payload bytes
+    }
+    send (8)    dNULLREG    mMSGHDR    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud    // Write the interleaved U/V block
+// End of PL8x4_Save_NV12  
--- a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x4_Save_NV12.inc
+//
+// Symbol setup for PL8x4_Save_NV12.asm: data port message sizes, scratch
+// registers for the write mask, and the SRC/DEST aliases chosen by nSRC_REGION.
+//
+#include "undefall.inc"                 // Undefine the SRC and DEST symbols
+// ---- Data port write: block dimensions and message lengths ----
+#undef  nDPW_BLOCK_SIZE_Y
+#define nDPW_BLOCK_SIZE_Y       nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8     // Y block is 16x8
+#undef  nDPW_MSG_SIZE_Y
+#define nDPW_MSG_SIZE_Y         nMSGLEN_4                           // Y payload length (4)
+#undef  nDPW_BLOCK_SIZE_UV
+#define nDPW_BLOCK_SIZE_UV      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4     // Interleaved U/V block is 16x4
+#undef  nDPW_MSG_SIZE_UV
+#define nDPW_MSG_SIZE_UV        nMSGLEN_2                           // U/V payload length (2)
+// ---- Data port read used by the masked merge: response lengths ----
+#undef  nDPR_MSG_SIZE_Y
+#define nDPR_MSG_SIZE_Y         nRESLEN_4                           // Y response length (4)
+#undef  nDPR_MSG_SIZE_UV
+#define nDPR_MSG_SIZE_UV        nRESLEN_2                           // U/V response length (2)
+// ---- Scratch registers holding the per-row write mask ----
+#define     rMASK_TEMP      REG(r,nTEMP0)
+.declare    uwMASK_TEMP     Base=rMASK_TEMP     ElementSize=2    SrcRegion=<8;8,1>    Type=uw        // word view of rMASK_TEMP
+#define     rMASK_TEMP1     REG(r,nTEMP1)
+.declare    udMASK_TEMP1    Base=rMASK_TEMP1    ElementSize=4    SrcRegion=<4;4,1>    Type=ud        // dword view of rMASK_TEMP1
+.declare    uwMASK_TEMP1    Base=rMASK_TEMP1    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        // word view of rMASK_TEMP1
+// ---- Region aliases: DEST_* is the data to save, SRC_* receives the read-back block for masking ----
+#if (nSRC_REGION==nREGION_1)
+    // Data to save is in TOP; BOT is used for the read-back
+    #define ub2DEST_Y       ub2TOP_Y
+    #define ub2DEST_U       ub2TOP_U
+    #define ub2DEST_V       ub2TOP_V
+    #define ubDEST_Y        ubTOP_Y
+    #define ubDEST_U        ubTOP_U
+    #define ubDEST_V        ubTOP_V
+    #define udSRC_Y         udBOT_Y_IO
+    #define udSRC_U         udBOT_U_IO
+    #define udSRC_V         udBOT_V_IO
+    #define ubSRC_Y         ubBOT_Y
+    #define ubSRC_U         ubBOT_U
+    #define ubSRC_V         ubBOT_V
+    #define ub2SRC_U        ub2BOT_U
+    #define uwSRC_U         uwBOT_U
+    #define uwSRC_V         uwBOT_V
+#elif (nSRC_REGION==nREGION_2)
+    // Data to save is in BOT; TOP is used for the read-back
+    #define ub2DEST_Y       ub2BOT_Y
+    #define ub2DEST_U       ub2BOT_U
+    #define ub2DEST_V       ub2BOT_V
+    #define ubDEST_Y        ubBOT_Y
+    #define ubDEST_U        ubBOT_U
+    #define ubDEST_V        ubBOT_V
+    #define udSRC_Y         udTOP_Y_IO
+    #define udSRC_U         udTOP_U_IO
+    #define udSRC_V         udTOP_V_IO
+    #define ubSRC_Y         ubTOP_Y
+    #define ubSRC_U         ubTOP_U
+    #define ubSRC_V         ubTOP_V
+    #define ub2SRC_U        ub2TOP_U
+    #define uwSRC_U         uwTOP_U
+    #define uwSRC_V         uwTOP_V
+#endif
+// Disabled legacy declarations, kept for reference (original note: "Yoni - masking is not relevant for ILK?!?"):
+//#define         TEMP0   REG(r,54)
+//.declare    TEMP        Base=TEMP0      ElementSize=2   SrcRegion=<8;8,1>       Type=uw
+// End of disabled legacy declarations
--- a/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x5_PL8x8.asm - in-place vertical expansion of the U/V rows in DEST_U/DEST_V: each input row is copied to an even output row and each odd output row is the average of two adjacent input rows
+#include "Expansion.inc"
+//------------------------------- Vertical Upconversion (offsets imply input rows 8 elements apart, output rows 8 apart in 16-element pairs) ------------------------------
+    avg.sat (8) uwDEST_U(0, 3*16+8)<1>   uwDEST_U(0, 3*8)    uwDEST_U(0, (1+3)*8)    // Optimization: odd row of pair 3 = avg(input rows 3, 4), done before the loop because the loop's avg starts at pair nUV_NUM_OF_ROWS/2-2
+    avg.sat (8) uwDEST_V(0, 3*16+8)<1>   uwDEST_V(0, 3*8)    uwDEST_V(0, (1+3)*8)    // Optimization: same for V
+    $for(nUV_NUM_OF_ROWS/2-2; >-1; -1) {    // Bottom-up, so input rows are read before being overwritten. NOTE(review): the hard-coded 3 above presumably equals nUV_NUM_OF_ROWS/2-1 - confirm
+        mov     (8) uwDEST_U(0, (1+%1)*16)<1>    uwDEST_U(0, (1+%1)*8)    // Even output row of pair %1+1 = input row %1+1
+        avg.sat (8) uwDEST_U(0, %1*16+8)<1>   uwDEST_U(0, %1*8)    uwDEST_U(0, (1+%1)*8)    // Odd output row of pair %1 = avg(input rows %1, %1+1)
+        mov     (8) uwDEST_V(0, (1+%1)*16)<1>    uwDEST_V(0, (1+%1)*8)    // Same for V
+        avg.sat (8) uwDEST_V(0, %1*16+8)<1>   uwDEST_V(0, %1*8)    uwDEST_V(0, (1+%1)*8)    // Same for V
+    }
+// End of PL8x5_PL8x8
--- a/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x8_PL8x4.asm
+//
+// Convert planar U/V from 8x8 to 8x4 in GRF, in place, by keeping every other row (no filtering)
+//---------------------------------------------------------------
+//  Symbols needed to be defined before including this module
+//
+//	DWORD_ALIGNED_DEST:	only if DEST_Y, DEST_U, DEST_V data are DWord aligned
+//	ORIX:
+//---------------------------------------------------------------
+#include "PL8x8_PL8x4.inc"
+// Convert PL8x8 to PL8x4 (row 0 stays in place; offsets below imply 16 bytes per row) ---------------------------------------------------------
+  mov (8) ubDEST_U(0,16)<2> ubDEST_U(1)<16;8,2> //selecting U every other row: output row at byte offset 16 <- start of register 1
+  mov (16) ubDEST_U(0,32)<2> ubDEST_U(2)<32;8,2> //selecting U every other row: output rows at byte offset 32 <- 8 elements from each of two starts 32 bytes apart, beginning at register 2
+  mov (8) ubDEST_V(0,16)<2> ubDEST_V(1)<16;8,2> //selecting V every other row: same pattern as U
+  mov (16) ubDEST_V(0,32)<2> ubDEST_V(2)<32;8,2> //selecting V every other row: same pattern as U
+// End of PL8x8_PL8x4.asm -------------------------------------------------------
\ No newline at end of file
--- a/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x8_PL8x4.inc
+//
+// Symbol setup for the PL8x8 -> PL8x4 conversion.
+//
+// Binds the byte-typed DEST_Y/DEST_U/DEST_V names to the TOP or BOT
+// planar region, depending on nSRC_REGION.
+//
+#include "undefall.inc"             // Undefine the SRC and DEST symbols
+#if (nSRC_REGION==nREGION_1)
+    // Region 1: work on the TOP planes
+    #define ubDEST_Y    ubTOP_Y
+    #define ubDEST_U    ubTOP_U
+    #define ubDEST_V    ubTOP_V
+#elif (nSRC_REGION==nREGION_2)
+    // Region 2: work on the BOT planes
+    #define ubDEST_Y    ubBOT_Y
+    #define ubDEST_U    ubBOT_U
+    #define ubDEST_V    ubBOT_V
+#endif
--- a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x8_Save_P208.asm
+//
+// Save the current planar block through the data port: a 16x8 Y block (skipped when SAVE_UV_ONLY is defined) and a 16x8 interleaved U/V block. No mask read-back/merge is done in this module.
+//---------------------------------------------------------------
+//  Symbols needed to be defined before including this module
+//
+//      DWORD_ALIGNED_DEST:     only if DEST_Y, DEST_U, DEST_V data are DWord aligned
+//      ORIX:
+//---------------------------------------------------------------
+#include "PL8x8_Save_P208.inc"
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud       // Start the message header as a copy of rMSGSRC
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8). NOTE(review): nothing in this module jumps to the WritePlanarToDataPort label below - confirm whether it is still needed
+    mov  (2) mMSGHDR.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) mMSGHDR.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+WritePlanarToDataPort:
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+            mov (16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)      // Y row %1 into the first 16 payload bytes
+            mov (16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)    // Y row %1+1 into the next 16 payload bytes
+    } 
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud    // Write the Y block
+#endif
+//** Save  8x8 packed U and V -----------------------------------------------------
+// The U/V block uses the same origin as the Y block here: unlike the NV12 save, no asr
+// is applied to the vertical origin. rMSGSRC is updated with the origin and U/V block
+// size first and then copied to mMSGHDR.
+    mov (2)  rMSGSRC.0<1>:d    wORIX<2;2,1>:w             // Block origin
+    mov (1)  rMSGSRC.2<1>:ud   nDPW_BLOCK_SIZE_UV:ud      // U/V block width and height (16x8)
+    mov (8)  mMSGHDR<1>:ud     rMSGSRC<8;8,1>:ud          // Header for the U/V write
+    $for(0,0; <nY_NUM_OF_ROWS;2,1) {
+        mov (16) mubMSGPAYLOAD(%2,0)<2>     ub2DEST_U(%2)REGION(16,2)    // U into even payload bytes
+        mov (16) mubMSGPAYLOAD(%2,1)<2>     ub2DEST_V(%2)REGION(16,2)    // V into odd payload bytes
+    }
+    send (8)    dNULLREG    mMSGHDR    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud    // Write the interleaved U/V block
+//End of PL8x8_Save_P208.asm  
--- a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//Module name: PL8x8_Save_P208.inc
+//
+// Setup for storing planar data: data port write sizes for the 16x8 Y block
+// and the 16x8 interleaved U/V block, plus the SRC/DEST region aliases
+// selected by nSRC_REGION.
+//
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+// The nDPW_* names are also defined, with different U/V values, by
+// PL8x4_Save_NV12.inc. Undefine them first (as that file does) so that this
+// setup never conflicts with an earlier definition.
+#undef nDPW_BLOCK_SIZE_Y
+#undef nDPW_MSG_SIZE_Y
+#undef nDPW_BLOCK_SIZE_UV
+#undef nDPW_MSG_SIZE_UV
+#define nDPW_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // Y block size 16x8
+#define nDPW_MSG_SIZE_Y     nMSGLEN_4                         // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8    // U/V interleaved block width and height (16x8)
+#define nDPW_MSG_SIZE_UV    nMSGLEN_4                         // # of MRF's to hold U/V block data (4)
+// Region aliases: DEST_* names the data to be saved, SRC_* names the other region
+#if (nSRC_REGION==nREGION_1)
+        #define udSRC_Y                 udBOT_Y_IO
+        #define udSRC_U                 udBOT_U_IO
+        #define udSRC_V                 udBOT_V_IO
+        #define ubSRC_Y                 ubBOT_Y
+        #define ubSRC_U                 ubBOT_U
+        #define ubSRC_V                 ubBOT_V
+        #define uwSRC_U                 uwBOT_U  //For masking operation
+        #define uwSRC_V                 uwBOT_V
+        #define ub2DEST_Y               ub2TOP_Y
+        #define ub2DEST_U               ub2TOP_U
+        #define ub2DEST_V               ub2TOP_V
+#elif (nSRC_REGION==nREGION_2)
+        #define udSRC_Y                 udTOP_Y_IO
+        #define udSRC_U                 udTOP_U_IO
+        #define udSRC_V                 udTOP_V_IO
+        #define ubSRC_Y                 ubTOP_Y
+        #define ubSRC_U                 ubTOP_U
+        #define ubSRC_V                 ubTOP_V
+        #define uwSRC_U                 uwTOP_U  //For masking operation
+        #define uwSRC_V                 uwTOP_V
+        #define ub2DEST_Y               ub2BOT_Y
+        #define ub2DEST_U               ub2BOT_U
+        #define ub2DEST_V               ub2BOT_V
+#endif
+///* Yoni - masking is not relevant for ILK?!? 
+//#define         TEMP0   REG(r,54)
+//.declare    TEMP        Base=TEMP0      ElementSize=2   SrcRegion=<8;8,1>       Type=uw
+///* Yoni - masking is not relevant for ILK?!? 
--- a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x8_Save_PA.asm
+//
+// Save planar YUV422 to packed YUV422 format data: pack Y/U/V into the DEST_YUV region, merge with the read-back destination when the block mask is not full, then write one 32x8 block
+//
+// Note: SRC_* must refer to regions with data type "BYTE"
+//               so that data can be saved to byte-aligned locations
+#include "PL8x8_Save_PA.inc"
+    add (4) pCF_Y_OFFSET<1>:uw   ubDEST_CF_OFFSET<4;4,1>:ub   nDEST_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+    // Pack Y
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2>    ubSRC_Y(0,%1*32)    // 16 Y values to every 2nd byte of packed row %1 (indirect destination)
+    }
+    // Pack U/V
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8)  r[pCF_U_OFFSET, %1*nGRFWIB]<4>    ubSRC_U(0, %1*16)    // 8 U values to every 4th byte of packed row %1
+        mov (8)  r[pCF_V_OFFSET, %1*nGRFWIB]<4>    ubSRC_V(0, %1*16)    // 8 V values to every 4th byte of packed row %1
+    }
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w            1:w  { NoDDClr }             // H. block origin need to be doubled
+    mov (1) rMSGSRC.1<1>:d      wORIY<0;1,0>:w                 { NoDDClr, NoDDChk }    // Block origin
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_YUV:ud         { NoDDChk }             // Block width and height (32x8)
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                                      // Copy rMSGSRC into the message header
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud   //Keep only the low 24 bits of the block mask
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WritePackedToDataPort                                   //All 24 mask bits set: skip the read-back and merge
+    //If mask is not all 1's, then load the entire 32x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+    // Load 32x8 packed YUV 422 ----------------------------------------------------
+    send (8) udSRC_YUV(0)<1>    mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud    // Read back the destination block
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                        // Reload the message header from rMSGSRC for the write below
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //Channels whose vertical-mask bit is set receive the horizontal mask
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Channels whose vertical-mask bit is clear receive 0
+    // Destination is Byte aligned
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>     //Flag = mask word of row %1
+        (-f0.1) mov (16)    uwDEST_YUV(%1)<1>         uwSRC_YUV(%1)        //Mask bit clear: take the read-back word. Open item from the original author: check the UV merge - vK
+    }
+WritePackedToDataPort:
+    //  Packed YUV data are stored in one of the I/O regions before moving to MRF
+    //  Note: This is necessary since indirect addressing is not supported for MRF. 
+    //  Packed data block should be saved as 32x8 pixel block
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_YUV(%1)REGION(8,1)    // Packed row %1 into payload register %1
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud    // Write the packed block
+// End of PL8x8_Save_PA
--- a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL8x8_Save_PA.inc
+//
+// Symbol setup for PL8x8_Save_PA.asm (storing packed data): data port
+// message sizes, the mask scratch register, and the region aliases selected
+// by nSRC_REGION.
+//
+#include "undefall.inc"                 // Undefine the SRC and DEST symbols
+// ---- Data port write ----
+#define nDPW_BLOCK_SIZE_YUV     nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8     // Packed YUV block is 32x8
+#define nDPW_MSG_SIZE_YUV       nMSGLEN_8                           // Payload length (8)
+// ---- Data port read used by the masked merge ----
+#undef  nDPR_MSG_SIZE_YUV
+#define nDPR_MSG_SIZE_YUV       nRESLEN_8                           // Response length (8)
+// ---- Scratch register holding the per-row write mask ----
+#define     rMASK_TEMP      REG(r,nTEMP0)
+.declare    uwMASK_TEMP     Base=rMASK_TEMP     ElementSize=2    SrcRegion=<8;8,1>    Type=uw        // word view of rMASK_TEMP
+// ---- Region aliases ----
+#if (nSRC_REGION==nREGION_1)
+    // Planar input is read from TOP; the packed result is built in BOT
+    #define ubSRC_Y          ub2TOP_Y
+    #define ubSRC_U          ub2TOP_U
+    #define ubSRC_V          ub2TOP_V
+    #define nDEST_YUV_REG    nBOT_Y
+    #define udDEST_YUV       udBOT_Y_IO
+    #define uwDEST_YUV       uwBOT_Y
+    // Read-back destination data for the masked merge lands in TOP
+    #define udSRC_YUV        udTOP_Y_IO
+    #define uwSRC_YUV        uwTOP_Y
+#elif (nSRC_REGION==nREGION_2)
+    // Planar input is read from BOT; the packed result is built in TOP
+    #define ubSRC_Y          ub2BOT_Y
+    #define ubSRC_U          ub2BOT_U
+    #define ubSRC_V          ub2BOT_V
+    #define nDEST_YUV_REG    nTOP_Y
+    #define udDEST_YUV       udTOP_Y_IO
+    #define uwDEST_YUV       uwTOP_Y
+    // Read-back destination data for the masked merge lands in BOT
+    #define udSRC_YUV        udBOT_Y_IO
+    #define uwSRC_YUV        uwBOT_Y
+#endif
--- a/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL9x5_PL16x8.asm - in-place expansion of the U/V data in DEST_U/DEST_V: horizontal expansion to 16 values per row, then vertical expansion by copying rows and averaging adjacent rows
+#define EXPAND_9x5
+#include "Expansion.inc"
+//------------------------------ Horizontal Upconversion (row nUV_NUM_OF_ROWS-1 is not processed by this loop; NOTE(review): confirm against Expansion.inc) -----------------------------
+    $for (nUV_NUM_OF_ROWS-2; >-1; -1) {
+        avg.sat (16) uwDEST_U(0, %1*16)<1>    uwDEST_U(0, %1*16)<1;2,0>    uwDEST_U(0, %1*16)<1;2,1>    // <1;2,0> reads s0,s0,s1,s1,... and <1;2,1> reads s0,s1,s1,s2,... so the result is s0, avg(s0,s1), s1, avg(s1,s2), ...
+        avg.sat (16) uwDEST_V(0, %1*16)<1>    uwDEST_V(0, %1*16)<1;2,0>    uwDEST_V(0, %1*16)<1;2,1>    // Same for V
+    }
+#undef 	nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS		8	//use packed version of all post-processing kernels
+//------------------------------- Vertical Upconversion (offsets imply input rows 16 elements apart, output rows 16 apart in 32-element pairs) ------------------------------
+    avg.sat (16) uwDEST_U(0, 3*32+16)<1>   uwDEST_U(0, 3*16)    uwDEST_U(0, (1+3)*16)    // Odd row of pair 3 = avg(input rows 3, 4), done before the loop because the loop's avg starts at pair 2
+    avg.sat (16) uwDEST_V(0, 3*32+16)<1>   uwDEST_V(0, 3*16)    uwDEST_V(0, (1+3)*16)    // Same for V
+    $for(nUV_NUM_OF_ROWS/2-2; >-1; -1) {    // Bottom-up, so input rows are read before being overwritten
+        mov     (16) uwDEST_U(0, (1+%1)*32)<1>    uwDEST_U(0, (1+%1)*16)    // Even output row of pair %1+1 = input row %1+1
+        avg.sat (16) uwDEST_U(0, %1*32+16)<1>   uwDEST_U(0, %1*16)    uwDEST_U(0, (1+%1)*16)    // Odd output row of pair %1 = avg(input rows %1, %1+1)
+        mov     (16) uwDEST_V(0, (1+%1)*32)<1>    uwDEST_V(0, (1+%1)*16)    // Same for V
+        avg.sat (16) uwDEST_V(0, %1*32+16)<1>   uwDEST_V(0, %1*16)    uwDEST_V(0, (1+%1)*16)    // Same for V
+    }
+// End of PL9x5_PL16x8
--- a/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: PL9x8_PL16x8.asm - in-place horizontal expansion of every U/V row in DEST_U/DEST_V to 16 values; no vertical step in this module
+#include "Expansion.inc"
+//------------------------------ Horizontal Upconversion -----------------------------
+    $for (0; <nUV_NUM_OF_ROWS; 1) {
+        avg.sat (16) uwDEST_U(0, %1*16)<1>    uwDEST_U(0, %1*16)<1;2,0>    uwDEST_U(0, %1*16)<1;2,1>    // <1;2,0> reads s0,s0,s1,s1,... and <1;2,1> reads s0,s1,s1,s2,... so the result is s0, avg(s0,s1), s1, avg(s1,s2), ...
+        avg.sat (16) uwDEST_V(0, %1*16)<1>    uwDEST_V(0, %1*16)<1;2,0>    uwDEST_V(0, %1*16)<1;2,1>    // Same for V
+    }
+// End of PL9x8_PL16x8
\ No newline at end of file
--- a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: RGB16x8_Save_RGB.asm
+//
+// Save packed ARGB 444 frame data block of size 16x8; when the block mask is not full, each half is read back and merged under the mask before both halves are written
+//
+// To save 16x8 block (64x8 byte layout for ARGB8888) we need 2 send instructions
+//  ---------
+//  | 1 | 2 |
+//  --------- 
+#include "RGB16x8_Save_RGB.inc"
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w            2:w  { NoDDClr }             // H. block origin need to be quadrupled
+    mov (1) rMSGSRC.1<1>:d      wORIY<0;1,0>:w                 { NoDDClr, NoDDChk }    // Block origin (1st quadrant)
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_ARGB:ud        { NoDDChk }             // Block width and height (32x8)
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                                      // Copy rMSGSRC into the message header
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud   //Keep only the low 24 bits of the block mask
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WriteARGBToDataPort                                     //All 24 mask bits set: skip both read-backs and merges
+    //If mask is not all 1's, then load the entire 64x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+    // Load first block 16x8 packed ARGB 444 ---------------------------------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF00FF00:ud   //Check first block: force bits 8-15 and 24-31 to 1 so only bits 0-7 and 16-23 are tested
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi SkipFirstBlockMerge                                     //If full mask then skip this block
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud    // Read back the first half of the destination
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                        // Reload the message header from rMSGSRC
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //use sel instruction - vK
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Channels whose vertical-mask bit is clear receive 0
+    $for(0, 0; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>     //Flag = mask word of row %1
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1)               //Mask bit clear: take the read-back pixel (even DEST registers)
+    }
+SkipFirstBlockMerge:
+    // Load second block 16x8 packed ARGB 444 ---------------------------------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF0000FF:ud   //Check second block: force bits 0-7 and 24-31 to 1 so only bits 8-23 are tested
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi WriteARGBToDataPort                                     //If full mask then skip this block
+    add  (1) mMSGHDR.0<1>:d     rMSGSRC.0<0;1,0>:d       32:d     // Point to 2nd part
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR    udDUMMY_NULL  nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud    // Read back the second half of the destination
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                 // Point to 1st part again
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  shr (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    8:uw    //load the mask for second block
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Channels whose vertical-mask bit is clear receive 0
+    $for(0, 1; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>     //Flag = mask word of row %1
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1)               //Mask bit clear: take the read-back pixel (odd DEST registers)
+    }
+WriteARGBToDataPort:
+    // Move packed data to MRF and output
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*2)        // Even DEST registers form the first half
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud    // Write the first half
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud                   // Reload the message header from rMSGSRC
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       32:d   // Point to 2nd part
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*2+1)      // Odd DEST registers form the second half
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud    // Write the second half
+// End of RGB16x8_Save_RGB
--- a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: RGB16x8_Save_RGB.inc
+//
+// Symbol setup for RGB16x8_Save_RGB.asm: data port message sizes, the mask
+// scratch register, and the region aliases selected by nSRC_REGION.
+//
+#include "undefall.inc"                 // Undefine the SRC and DEST symbols
+// ---- Data port write ----
+#define nDPW_BLOCK_SIZE_ARGB    nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8     // One ARGB half-block is 32x8
+#define nDPW_MSG_SIZE_ARGB      nMSGLEN_8                           // Payload length (8)
+// ---- Data port read used by the masked merge ----
+#undef  nDPR_MSG_SIZE_ARGB
+#define nDPR_MSG_SIZE_ARGB      nRESLEN_8                           // Response length (8)
+// ---- Scratch register holding the per-row write mask ----
+#define     rMASK_TEMP      REG(r,nTEMP0)
+.declare    uwMASK_TEMP     Base=rMASK_TEMP     ElementSize=2    SrcRegion=<8;8,1>    Type=uw        // word view of rMASK_TEMP
+// ---- Region aliases ----
+// udDEST_ARGB: output of the previous stage; it is modified and then written to the render cache.
+// udSRC_ARGB : receives the existing destination data that must not be modified.
+#if (nSRC_REGION==nREGION_1)
+    #define udSRC_ARGB       udBOT_Y_IO
+    #define udDEST_ARGB      udTOP_Y_IO
+#elif (nSRC_REGION==nREGION_2)
+    #define udSRC_ARGB       udTOP_Y_IO
+    #define udDEST_ARGB      udBOT_Y_IO
+#endif
--- a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: RGB16x8_Save_RGB16.asm
+//
+// Save packed RGB565 frame data block of size 16x8
+//
+// The 16x8 ARGB8888 block in DEST_ARGB is converted to RGB565 in CSC_TEMP.
+// When the block mask is not full, the destination block is read back and
+// merged under the mask. The result is written as one 32x8-byte block.
+//
+// To save 16x8 block (32x8 byte layout for RGB565) we need 1 send instruction
+//  -----
+//  | 1 |
+//  ----- 
+#include "RGB16x8_Save_RGB16.inc"
+//convert 32 bit RGB to 16 bit RGB
+    // Truncate A8R8G8B8 to R5G6B5; alpha is not carried into the result.
+    // That is keeping 5 MSB of R and B, and 6 MSB of G.
+    // %1 = output row in CSC_TEMP, %2 = first of the two ARGB registers of that row.
+    $for (0, 0; <nY_NUM_OF_ROWS; 1, 2) {
+        // Execution size made explicit: the <32;8,4> region spans 16 pixels and
+        // the "or (16)" below consumes all 16 results.
+        shr (16) uwCSC_TEMP(%1,0)<1>   ubDEST_ARGB(%2,0)<32;8,4>   3:w                // B >> 3
+        shl (16) uwTEMP_RGB16(0)<1>    uwDEST_ARGB(%2,1)<16;8,2>   8:w                // R << 8
+        and (16) uwTEMP_RGB16(0)<1>    uwTEMP_RGB16(0)             0xF800:uw          // keep R in bits 11-15
+        or  (16) uwCSC_TEMP(%1,0)<1>   uwCSC_TEMP(%1,0)<16;16,1>   uwTEMP_RGB16(0)
+        shr (16) uwTEMP_RGB16(0)<1>    uwDEST_ARGB(%2,0)<16;8,2>   5:w                // G >> 5
+        and (16) uwTEMP_RGB16(0)<1>    uwTEMP_RGB16(0)             0x07E0:uw          // keep G in bits 5-10
+        or  (16) uwCSC_TEMP(%1,0)<1>   uwCSC_TEMP(%1,0)<16;16,1>   uwTEMP_RGB16(0)
+    }
+    mov (2) rMSGSRC.0<1>:d      wORIX<2;2,1>:w                      // Block origin (1st quadrant)
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w              1:w     // H. block origin need to be doubled for byte offset
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_RGB16:ud            // Block width and height (32x8)
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                   // Copy rMSGSRC into the message header
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud   //Keep only the low 24 bits of the block mask
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WriteRGB16ToDataPort                                    //All 24 mask bits set: skip the read-back and merge
+    //If mask is not all 1's, then load the entire 32x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+    // Load 32x8 packed RGB565 -----------------------------------------------------
+    send (8) udSRC_RGB16(0)<1>  mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                        // Reload the message header from rMSGSRC for the write below
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //use sel instruction - vK
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Channels whose vertical-mask bit is clear receive 0
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>     //Flag = mask word of row %1
+        (-f0.1) mov (16)    uwCSC_TEMP(%1)<1>         uwSRC_RGB16(%1)              //Mask bit clear: take the read-back pixel
+    }
+WriteRGB16ToDataPort:
+    // Move packed data to MRF and output
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udCSC_TEMP(%1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud
+// End of RGB16x8_Save_RGB16
--- a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: RGB16x8_Save_RGB16.inc
+// Symbol definitions used by RGB16x8_Save_RGB16.asm (save of a 16x8 RGB565 block)
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+// For saving
+#define nDPW_BLOCK_SIZE_RGB16    nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8    // RGB16 block size 32x8
+#define nDPW_MSG_SIZE_RGB16      nMSGLEN_8                          // # of MRF's to hold RGB16 block data (8)
+// For conversion to 16bit: scratch row used while assembling the R and G fields
+.declare	uwTEMP_RGB16    Base=REG(r,nTEMP1)	ElementSize=2 SrcRegion=<16;16,1>	Type=uw		//1 GRF
+// For masking
+#undef  nDPR_MSG_SIZE_RGB16
+#define nDPR_MSG_SIZE_RGB16      nRESLEN_8                          // Response length of the data port read of the existing RGB16 block (nRESLEN_8)
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define ubDEST_ARGB      ubTOP_Y       //Data from previous module
+    #define uwDEST_ARGB      uwTOP_Y       //Data from previous module
+    #define udCSC_TEMP       udBOT_Y_IO    //Data Converted to 16 bits
+    #define uwCSC_TEMP       uwBOT_Y
+    //For masking operation (aliases the same TOP region as the input above)
+    #define udSRC_RGB16      udTOP_Y_IO    //To hold the destination data that shouldn't be modified
+    #define uwSRC_RGB16      uwTOP_Y
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define ubDEST_ARGB      ubBOT_Y       //Data from previous module
+    #define uwDEST_ARGB      uwBOT_Y       //Data from previous module
+    #define udCSC_TEMP       udTOP_Y_IO    //Data Converted to 16 bits
+    #define uwCSC_TEMP       uwTOP_Y
+    //For masking operation (aliases the same BOT region as the input above)
+    #define udSRC_RGB16      udBOT_Y_IO    //To hold the destination data that shouldn't be modified
+    #define uwSRC_RGB16      uwBOT_Y
+#endif
--- a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: RGB16x8_Save_Y416.asm
+// NOTE(review): the #include below names RGB16x8_Save_RGB.inc, not RGB16x8_Save_Y416.inc -- confirm this is intended
+// Save packed ARGB 444 frame data block of size 16x8
+//
+// To save 16x8 block (128x8 byte layout for ARGB 16bit per component) we need 4 send instructions
+//  -----------------
+//  | 1 | 2 | 3 | 4 |
+//  ----------------- 
+#include "RGB16x8_Save_RGB.inc"
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w            3:w  { NoDDClr }             // H. block origin is multiplied by 8 to get a byte offset (8 bytes per pixel)
+    mov (1) rMSGSRC.1<1>:d      wORIY<0;1,0>:w                 { NoDDClr, NoDDChk }    // Block origin (1st quadrant)
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_ARGB:ud        { NoDDChk }             // Block width and height (32x8)
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud
+/*	Not needed for validation kernels for now -vK
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WriteARGBToDataPort
+    //If mask is not all 1's, then load the entire 64x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+    // Load first block 16x8 packed ARGB 444 ---------------------------------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF00FF00:ud   //Check first block
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi SkipFirstBlockMerge                                     //If full mask then skip this block
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //use sel instruction - vK
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+    $for(0, 0; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1) 
+    }
+SkipFirstBlockMerge:
+    // Load second block 16x8 packed ARGB 444 ---------------------------------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF0000FF:ud   //Check second block
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi WriteARGBToDataPort                                     //If full mask then skip this block
+    add  (1) mMSGHDR.0<1>:d     rMSGSRC.0<0;1,0>:d       32:d     // Point to 2nd part
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR    udDUMMY_NULL  nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                 // Point to 1st part again
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  shr (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    8:uw    //load the mask for second block
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+    $for(0, 1; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1) 
+    }
+*/
+WriteARGBToDataPort:
+    // Move packed data to MRF and output; each row occupies 4 registers of DEST_ARGB, quarter k of row %1 is register %1*4+k
+    //Write 1st 4X8 pixels  
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+	//Write 2nd 4X8 pixels  
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       32:d   // Point to 2nd part (byte offset 32)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4+1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+	//Write 3rd 4X8 pixels  
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       64:d   // Point to 3rd part (byte offset 64)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4+2)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+	//Write 4th 4X8 pixels  
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       96:d   // Point to 4th part (byte offset 96)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4+3)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+// End of RGB16x8_Save_Y416
--- a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: RGB16x8_Save_Y416.inc
+// Symbol definitions for saving a 16x8 block of 16-bit-per-component ARGB in 32x8-byte pieces
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+// For saving
+#define nDPW_BLOCK_SIZE_ARGB     nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8    // ARGB block size 32x8
+#define nDPW_MSG_SIZE_ARGB       nMSGLEN_8                          // # of MRF's to hold ARGB block data (8)
+// For masking
+#undef  nDPR_MSG_SIZE_ARGB
+#define nDPR_MSG_SIZE_ARGB       nRESLEN_8                          // Response length of the data port read of the existing ARGB block (nRESLEN_8)
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define udDEST_ARGB      udTOP_Y_IO    //The output of previous stage is stored here; This is modified and is written to render cache
+    //For masking operation
+    #define udSRC_ARGB       udBOT_Y_IO    //To hold the destination data that shouldn't be modified
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define udDEST_ARGB      udBOT_Y_IO    //The output of previous stage is stored here; This is modified and is written to render cache
+    //For masking operation
+    #define udSRC_ARGB       udTOP_Y_IO    //To hold the destination data that shouldn't be modified
+#endif
--- a/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm
+++ b/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+.declare SRC_B		Base=REG(r,10)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare SRC_G		Base=REG(r,18)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare SRC_R		Base=REG(r,26)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare SRC_A		Base=REG(r,34)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+#define DEST_ARGB		ubBOT_ARGB
+#undef 	nSRC_REGION
+#define nSRC_REGION		nREGION_2
+// Interleave the planar B, G, R, A rows (16 words each) into DEST_ARGB at byte offsets 0..3 with stride 4, 8 pixels per destination register. TODO(vK): pack directly to MRF as an optimization
+$for(0, 0; <8; 1, 2) {
+//	mov	(16) 	DEST_ARGB(%2,0)<4>		SRC_B(%1) 					{ Compr, NoDDClr }			// 16 B
+//	mov	(16) 	DEST_ARGB(%2,1)<4>		SRC_G(%1)					{ Compr, NoDDClr, NoDDChk }	// 16 G
+//	mov	(16) 	DEST_ARGB(%2,2)<4>		SRC_R(%1)					{ Compr, NoDDClr, NoDDChk }	// 16 R	//these 2 inst can be merged - vK
+//	mov	(16) 	DEST_ARGB(%2,3)<4>		SRC_A(%1)					{ Compr, NoDDChk }			//DEST_RGB_FORMAT<0;1,0>:ub	{ Compr, NoDDChk }			// 16 A
+	mov	(8) 	DEST_ARGB(%2,  0)<4>		SRC_B(%1) 					{ NoDDClr }				// 8 B (pixels 0-7 of the row)
+	mov	(8) 	DEST_ARGB(%2,  1)<4>		SRC_G(%1)					{ NoDDClr, NoDDChk }	// 8 G
+	mov	(8) 	DEST_ARGB(%2,  2)<4>		SRC_R(%1)					{ NoDDClr, NoDDChk }	// 8 R
+	mov	(8) 	DEST_ARGB(%2,  3)<4>		SRC_A(%1)					{ NoDDChk }				// 8 A
+	mov	(8) 	DEST_ARGB(%2+1,0)<4>		SRC_B(%1,8) 				{ NoDDClr }				// 8 B (pixels 8-15 of the row)
+	mov	(8) 	DEST_ARGB(%2+1,1)<4>		SRC_G(%1,8)					{ NoDDClr, NoDDChk }	// 8 G
+	mov	(8) 	DEST_ARGB(%2+1,2)<4>		SRC_R(%1,8)					{ NoDDClr, NoDDChk }	// 8 R
+	mov	(8) 	DEST_ARGB(%2+1,3)<4>		SRC_A(%1,8)					{ NoDDChk }				// 8 A
+}
--- a/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm
+++ b/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: SetupVPKernel.asm
+//
+// Initial setup for running video-processing kernels
+//
+#include "common.inc"
+//
+//  Now, begin source code....
+//
+.code
+#include "Init_All_Regs.asm"
+mov (8)     rMSGSRC.0<1>:ud  r0.0<8;8,1>:ud  // Initialize message payload header with R0
+#if	defined (INC_BLENDING)
+    mul	(1)	fALPHA_STEP_X:f   fSCALING_STEP_RATIO:f 	fVIDEO_STEP_X:f	//AlphaStepX = StepX_ratio * VideoStepX (StepX_ratio = AlphaStepX / VideoStepX)
+#endif
+// End of SetupVPKernel
--- a/i965_drv_video/shaders/post_processing/Common/common.inc
+++ b/i965_drv_video/shaders/post_processing/Common/common.inc
--- a/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm
+++ b/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: readSampler16x1.asm
+//
+// Read one row of 16 pixels through the sampler
+//
+//#define SAMPLER_MSG_DSC		0x166A0000	// ILK Sampler Message Descriptor
+// Send Message [DevILK]                                Message Descriptor
+//  MBZ MsgL=5 MsgR=8                            H MBZ   SIMD     MsgType   SmplrIndx BindTab
+//  000 0 101 0 1000                             1  0     10     0000         0000    00000000
+//    0     A    8                                     A             0             0     0     0
+//     MsgL=1+2*2(u,v)=5 MsgR=8
+#define SAMPLER_MSG_DSC		0x0A8A0000	// ILK Sampler Message Descriptor
+	// Assume MSGSRC is set already in the caller
+        //mov (8)		rMSGSRC.0<1>:ud			0:ud	// Unused fields
+	// Read 16 sampled pixels and store them as float32 in 8 GRFs
+	// 422 data is expanded to 444, return 8 GRF in the order of RGB- (UYV-).
+	// 420 data has three surfaces, return 8 GRF. Valid is always in the 1st GRF when in R8.  Make sure not to overwrite the following 3 GRFs.
+	// alpha data is expanded to 4444, return 8 GRF in the order of RGBA (UYVA).
+    mov(16)     mMSGHDR<1>:uw   rMSGSRC<16;16,1>:uw
+    send (16)	DATABUF(0)<1>	mMSGHDR		udDUMMY_NULL	0x2 SAMPLER_MSG_DSC+SAMPLER_IDX+BINDING_IDX:ud
--- a/i965_drv_video/shaders/post_processing/Common/undefall.inc
+++ b/i965_drv_video/shaders/post_processing/Common/undefall.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Modual name: undefall.inc
+//
+// undefine all global symbol for new process
+//
+//Source definitions
+#undef  ubSRC_Y   
+#undef  ubSRC_U   
+#undef  ubSRC_V 
+#undef  ub2SRC_Y   
+#undef  ub2SRC_U   
+#undef  ub2SRC_V
+#undef  ub4SRC_Y   
+#undef  ub4SRC_U   
+#undef  ub4SRC_V
+#undef  uwSRC_Y   
+#undef  uwSRC_U   
+#undef  uwSRC_V
+#undef  udSRC_Y   
+#undef  udSRC_U   
+#undef  udSRC_V
+#undef  udSRC_YUV
+#undef  nSRC_YUV_REG
+//Destination definitions
+#undef  ubDEST_Y   
+#undef  ubDEST_U   
+#undef  ubDEST_V 
+#undef  ub2DEST_Y   
+#undef  ub2DEST_U   
+#undef  ub2DEST_V
+#undef  ub4DEST_Y   
+#undef  ub4DEST_U   
+#undef  ub4DEST_V
+#undef  uwDEST_Y   
+#undef  uwDEST_U   
+#undef  uwDEST_V
+#undef  udDEST_Y   
+#undef  udDEST_U   
+#undef  udDEST_V
+#undef  udDEST_YUV
+#undef  nDEST_YUV_REG
+#undef  ubDEST_ARGB
+// End of undefall.inc
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: AVS_IEF.inc
+#ifndef _AVS_INF_INC_
+#define _AVS_INF_INC_
+#include "undefall.inc"             //Undefine the SRC and DEST symbols
+        // Message Header
+        // m0.7         31:0    Debug
+        // m0.6         31:0    Debug
+        // m0.5         31:0    Ignored
+        // m0.4         31:0    Ignored
+        // m0.3         31:0    Ignored
+        // m0.2         31:16   Ignored
+        //              15      Alpha Write Channel Mask        enable=0, disable=1
+        //              14      Blue Write Channel Mask  (V)    
+        //              13      Green Write Channel Mask (Y)
+        //              12      Red Write Channel Mask   (U)
+        //              11:0    Ignored
+        // m0.1                 Ignored
+        // m0.0                 Ignored
+#define mAVS_8x8_HDR   m0               // Message Header
+#define mAVS_PAYLOAD   m1               // Message Payload Header
+#define mAVS_8x8_HDR_2   m2               // Message Header of the 2nd message
+#define mAVS_PAYLOAD_2   m3               // Message Payload Header of the 2nd message
+#define mAVS_8x8_HDR_UV   m2               // Message Header of the UV message (same MRF as mAVS_8x8_HDR_2)
+#define mAVS_PAYLOAD_UV   m3               // Message Payload Header of the UV message (same MRF as mAVS_PAYLOAD_2)
+#define rAVS_8x8_HDR   rMSGSRC          // Mirror of Message Header 
+#define rAVS_PAYLOAD   r9               // Mirror of Message Payload Header
+        // AVS payload
+        // m1.7                 Ignored
+        // m1.6                 Pixel 0 V Address       ---> ORIY (Y0)
+        // m1.5                 Delta V                 ---> Step Y
+        // m1.4                 Ignored
+        // m1.3                 Ignored
+        // m1.2                 Pixel 0 U Address       ---> ORIX (X0)
+        // m1.1                 U 2nd Derivative        ---> NLAS dx   (NOTE(review): AVS_SetupFirstBlock.asm writes Step X to .1 and the NLAS delta to .0 -- confirm the order)
+        // m1.0                 Delta U                 ---> Step X
+        // Sampler Message Descriptor
+        // 31:29        Reserved                        000
+        // 28:25        Message length                  0010
+        // 24:20        Response length                 xxxxx   ---> 4GRFs for each enabled channel
+        // 19           Header Present                  1
+        // 18           MBZ                             0
+        // 17:16        SIMD Mode                       11      ---> SIMD64
+        // 15:12        Message Type                    0011    ---> sample_8x8   (NOTE(review): the nAVS_MSG_DSC_* values below carry 1011b in bits 15:12 -- confirm)
+        // 11:8         Sampler Index                   xxxx
+        // 7:0          Binding Table Index             xxxxxxxx
+#define nAVS_MSG_DSC_1CH        0x044BB000  
+#define nAVS_MSG_DSC_2CH        0x048BB000
+#define nAVS_MSG_DSC_3CH        0x04CBB000      
+#define nAVS_MSG_DSC_4CH        0x050BB000 
+#define nAVS_RED_CHANNEL_ONLY   0x0000E000      // Enable Red channel only
+#define nAVS_GREEN_CHANNEL_ONLY 0x0000D000      // Enable Green channel only
+#define nAVS_RED_BLUE_CHANNELS  0x0000A000      // Enable Red and Blue channels
+#define nAVS_RGB_CHANNELS       0x00008000      // Enable RGB(YUV) channels
+#define nAVS_ALL_CHANNELS       0x00000000      // Enable all channels (ARGB\AYUV)
+.declare     ubAVS_RESPONSE  Base=REG(r,nTEMP8) ElementSize=1  SrcRegion=REGION(16,1) Type=ub
+.declare     uwAVS_RESPONSE  Base=REG(r,nTEMP8) ElementSize=2  SrcRegion=REGION(16,1) Type=uw
+.declare     ubAVS_RESPONSE_2  Base=REG(r,nTEMP24) ElementSize=1  SrcRegion=REGION(16,1) Type=ub
+.declare     uwAVS_RESPONSE_2  Base=REG(r,nTEMP24) ElementSize=2  SrcRegion=REGION(16,1) Type=uw
+#if (nSRC_REGION==nREGION_2)
+    #define uwDEST_Y        uwBOT_Y
+    #define uwDEST_U        uwBOT_U
+    #define uwDEST_V        uwBOT_V
+    #define ubDEST_Y        ubBOT_Y
+    #undef  nSRC_REGION
+    #define nSRC_REGION nREGION_2
+#else // any other value of nSRC_REGION is treated as nREGION_1
+    #define uwDEST_Y        uwTOP_Y
+    #define uwDEST_U        uwTOP_U
+    #define uwDEST_V        uwTOP_V
+    #define ubDEST_Y        ubTOP_Y
+    #undef  nSRC_REGION
+    #define nSRC_REGION     nREGION_1
+#endif
+#endif //_AVS_INF_INC_
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//------------------------------------------------------------------------------
+// AVS_SetupFirstBlock.asm
+//------------------------------------------------------------------------------
+    // Setup Message Header (currently disabled: the mov below is commented out)
+//    mov (8) mAVS_8x8_HDR<1>:ud      rMSGSRC<8;8,1>:ud                                                     
+    // Check NLAS Enable bit: when BIT15 of uwNLAS_ENABLE is clear, force the step delta to 0
+    and.z.f0.0	(1)	wNULLREG                uwNLAS_ENABLE:uw	BIT15:uw	
+    (f0.0)mov   (1) fVIDEO_STEP_DELTA:f     0.0:f   
+    // Setup Message Payload Header for 1st block of Media Sampler 8x8
+    mov (1) rAVS_PAYLOAD.0:f        fVIDEO_STEP_DELTA:f     //NLAS step delta (0 when NLAS is disabled)
+    mov (1) rAVS_PAYLOAD.1:f        fVIDEO_STEP_X:f         //Step X 
+    mov (1) rAVS_PAYLOAD.5:f        fVIDEO_STEP_Y:f         //Step Y 
+    mov (2) rAVS_PAYLOAD.2<4>:f     fSRC_VID_H_ORI<2;2,1>:f //Orig X -> .2 and Orig Y -> .6 (destination stride 4)
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//------------------------------------------------------------------------------
+// AVS_SetupSecondBlock.asm
+//------------------------------------------------------------------------------
+    //NLAS calculations for 2nd block of Media Sampler 8x8 (8 pixels after the first block): 
+    // X(i) = X0 + dx*i + ddx*i*(i-1)/2   ==>  X(8) = X0 + dx*8 +ddx*28
+    // dx(i)= dx(0) + ddx*i               ==>  dx(8)= dx + ddx*8
+    // Calculating X(8): accumulate X0 and dx*8 in acc0.2, then add ddx*28 into payload .2
+    mov (1)   acc0.2<1>:f           fSRC_VID_H_ORI:f                         
+    mac (1)   acc0.2<1>:f           fVIDEO_STEP_X:f          8.0:f           
+    mac (1)   rAVS_PAYLOAD.2:f      fVIDEO_STEP_DELTA:f      28.0:f                    
+    // Calculating dx(8): seed acc0.1 with dx, then add ddx*8 into payload .1
+    mov (1)   acc0.1<1>:f           fVIDEO_STEP_X:f                         
+    mac (1)   rAVS_PAYLOAD.1:f      fVIDEO_STEP_DELTA:f      8.0:f
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: DI.inc (binding indices, message descriptors and register declarations for the DI/denoise kernels)
+#ifdef GT
+// GT DI Kernel (nothing is defined in this branch)
+#else // ILK
+// ILK DI Kernel (nothing is defined in this branch)
+#endif
+//---------------------------------------------------------------------------
+// Binding table indices
+//---------------------------------------------------------------------------
+#define nBIDX_DI_PRV		10		// Previous DI-ed frame
+#define nBIDX_DI_CUR		13		// Current DI-ed frame
+#define	nBIDX_DN			7		// Denoised frame
+#define	nBIDX_STAT			20		// Statistics 
+#define nBIDX_DI_Source  4  // Source Surface
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+#define nSMPL_ENGINE		0x2
+#define nDATAPORT_WRITE		0x5
+#define nTS_EOT				0x27	// with End-Of-Thread bit ON
+		// Message descriptor for end-of-thread
+		//						= 000 0001 (message len) 00000 (resp len)
+		//						  0 (header present 0) 00000000000000 0 (URB dereferenced) 0000
+#define nEOT_MSGDSC			0x02000000
+		// Message descriptor for sampler read
+		//						= 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)  
+		//						  1 (header present 1) 0 11 (SIMD32/64 mode) 
+		//						  1000 (message type) 0000 (DI state index) 
+		//						  00000000 (binding table index - set later)
+		//						= 0x040b8000
+// comment begin
+// The following single set of definitions is commented out because of the walker feature;
+// it is replaced by the #ifdef GT / #else / #endif pair of sets below
+//#define nSMPL_MSGDSC		    0x040b8000
+//#define nSMPL_RESP_LEN_DI	    0x00c00000		// 12
+//#define nSMPL_RESP_LEN_NODI_PL  0x00500000		// 5
+//#define nSMPL_RESP_LEN_NODI_PA  0x00900000		// 9
+//#define nSMPL_RESP_LEN_NODN	    0x00900000		// 9
+//#define nSMPL_RESP_LEN_PDI	    0x00b00000		// 11
+// comment end
+#ifdef GT
+#define nSMPL_MSGDSC		    0x040b8000
+#define nSMPL_RESP_LEN_DI	    0x00c00000		// 12
+#define nSMPL_RESP_LEN_NODI_PL  0x00500000		// 5  //DI disable, the XY stored in 5th GRF, no impact to return length
+#define nSMPL_RESP_LEN_NODI_PA  0x00900000		// 9  //DI disable, the XY stored in 5th GRF, no impact to return length
+#define nSMPL_RESP_LEN_NODN	    0x00a00000		// 10 //NO DN, originally use 9, now we need use 10 to store the XY with walker (the only value that differs from the ILK set)
+#define nSMPL_RESP_LEN_PDI	    0x00b00000		// 11
+#else
+#define nSMPL_MSGDSC		    0x040b8000
+#define nSMPL_RESP_LEN_DI	    0x00c00000		// 12
+#define nSMPL_RESP_LEN_NODI_PL  0x00500000		// 5
+#define nSMPL_RESP_LEN_NODI_PA  0x00900000		// 9
+#define nSMPL_RESP_LEN_NODN	    0x00900000		// 9
+#define nSMPL_RESP_LEN_PDI	    0x00b00000		// 11
+#endif
+		// Message descriptor for dataport media write
+#ifdef GT
+		//						= 000 0000 (message len - set later) 00000 (resp len 0) 		
+		//						  1 (header present 1) 0 0 1010 (media block write) 00000
+		//						  00000000 (binding table index - set later)
+		//						= 0x00094000
+#define nDPMW_MSGDSC		    0x00094000
+#else // ILK
+		//						= 000 0000 (message len - set later) 00000 (resp len 0) 		
+		//						  1 (header present 1) 000 0 010 (media block write) 0000
+		//						  00000000 (binding table index - set later)
+		//						= 0x00082000
+#define nDPMW_MSGDSC		    0x00082000
+#endif
+#define nDPMW_MSG_LEN_STMM	    0x04000000		// 2 - STMM (message length is encoded in bits 28:25)
+#define nDPMW_MSG_LEN_DH	    0x04000000		// 2 - Denoise history
+#define nDPMW_MSG_LEN_PA_DN	    0x0a000000		// 5 - Denoised output
+#define nDPMW_MSG_LEN_PA_NODI	0x12000000		// 9 - Denoised output - denoise only - DI disabled
+#define nDPMW_MSG_LEN_PL_DN	    0x06000000		// 3 - Denoised output
+#define nDPMW_MSG_LEN_PL_NODI	0x0a000000		// 5 - Denoised output - denoise only - DI disabled
+#define nDPMW_MSG_LEN_DI	    0x0a000000		// 5 - DI output
+//---------------------------------------------------------------------------
+// Static and inline parameters
+//---------------------------------------------------------------------------
+// Static parameters
+.declare ubTFLD_FIRST		Base=r1.27	ElementSize=1 Type=ub	// top field first
+.declare ubSRCYUVOFFSET		Base=r1.4	ElementSize=1 Type=ub	// source packed format
+.declare ubDSTYUVOFFSET		Base=r1.8	ElementSize=1 Type=ub	// destination packed format
+.declare uwSPITCH_DIV2		Base=r1.10	ElementSize=2 Type=uw	// statistics surface pitch divided by 2
+// Inline parameters
+.declare uwXORIGIN			Base=r5.0	ElementSize=2 Type=uw	// X and Y origin
+.declare uwYORIGIN			Base=r5.1	ElementSize=2 Type=uw
+//---------------------------------------------------------------------------
+// Kernel GRF variables 
+//---------------------------------------------------------------------------
+// Message response (Denoised & DI-ed pixels & statistics); several names below alias the same registers
+.declare dRESP						Base=r8		ElementSize=4 Type=d	// Response message (12 or 5 or 11)
+.declare ubRESP						Base=r8		ElementSize=1 Type=ub	
+.declare dSTMM						Base=r16	ElementSize=4 Type=d	// STMM
+.declare ubDN_HIST_NODI		Base=r12	ElementSize=1 Type=ub	// Denoise history data (DI disabled)
+.declare ubDN_HIST_DI			Base=r17	ElementSize=1 Type=ub	// Denoise history data (DI enabled)
+.declare uwRETURNED_POSITION_DI	Base=r17	ElementSize=2 Type=uw	// XY_Return_Data (DI enabled)
+.declare uwRETURNED_POSITION_DN	Base=r12	ElementSize=2 Type=uw // XY_Return_Data (DI disabled)
+.declare ub1ST_FLD_DN			Base=r12	ElementSize=1 Type=ub	// 1st field Denoised data (DI enabled)
+.declare d1ST_FLD_DN			Base=r12	ElementSize=4 Type=d
+.declare ub2ND_FLD_DN			Base=r18	ElementSize=1 Type=ub	// 2nd field Denoised data (DI enabled)	
+.declare d2ND_FLD_DN			Base=r18	ElementSize=4 Type=d
+.declare ubPRV_DI					Base=r8		ElementSize=1 Type=ub	// Previous frame DI (DI enabled)
+.declare ubCUR_DI					Base=r12	ElementSize=1 Type=ub	// Current frame DI (DI enabled)
+// Packed denoised output
+.declare ubDN_YUV					Base=r22	ElementSize=1 Type=ub	// Denoised YUV422
+.declare dDN_YUV					Base=r22	ElementSize=4 Type=d
+#define	 npDN_YUV			704									// = 22*32 = 0x2C0 (byte offset of r22)
+// Packed DI output
+.declare dDI_YUV_PRV			Base=r32	ElementSize=4 Type=d	// Previous frame DI output
+.declare dDI_YUV_CUR			Base=r36	ElementSize=4 Type=d	// Current frame DI output
+#define	 npDI_YUV			1024									// = 32*32 = 0x400 (byte offset of r32)
+// For packed output
+#define	 p422_YOFFSET		a0.2	
+#define	 p422_UOFFSET		a0.3	
+#define	 p422_VOFFSET		a0.4
+#define	 pDN_TFLDSRC		a0.6	
+#define	 pDN_BFLDSRC		a0.7	
+#define	 npRESP				192									// = 6*32 (NOTE(review): dRESP is declared at r8 = 8*32 -- confirm which is intended)
+// Message source
+.declare udMSGSRC					Base=r70	  ElementSize=4 Type=ud
+.declare uwMSGSRC					Base=r70	  ElementSize=2 Type=uw
+.declare dMSGSRC          Base=r70    ElementSize=4 Type=d
+//---------------------------------------------------------------------------
+// Kernel MRF variables 
+//---------------------------------------------------------------------------
+#define	mMSGHDR_SMPL		m1									// Sampler message: m1~m2
+.declare mudMSGHDR_SMPL		Base=m1		ElementSize=4 Type=ud
+.declare muwMSGHDR_SMPL		Base=m1		ElementSize=2 Type=uw
+#define	mMSGHDR_DN			m3									// Denoise output: m3~m7 for PA, m3~m5 for PL
+.declare mdMSGHDR_DN		Base=m3		ElementSize=4 Type=d
+#define	mMSGHDR_STAT		m8									// Statistics output: m8~m9
+.declare mdMSGHDR_STAT		Base=m8		ElementSize=4 Type=d
+.declare mubMSGHDR_STAT		Base=m8		ElementSize=1 Type=ub
+#define	mMSGHDR_DI			m10									// DI output: m10~m14
+.declare mdMSGHDR_DI		Base=m10	ElementSize=4 Type=d
+#define	mMSGHDR_EOT			m15									// EOT
+#ifdef GT
+#define	MSGSRC
+#else
+#define MSGSRC				null:ud
+#endif
+//---------------------------------------------------------------------------
+// End of thread instruction (the ILK form carries an extra null:ud source operand)
+//---------------------------------------------------------------------------
+#ifdef GT
+#define END_THREAD			send (8) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC 
+#else	// ILK
+#define END_THREAD			send (8) null<1>:d mMSGHDR_EOT null:ud	nTS_EOT nEOT_MSGDSC
+#endif
+// End of DI.inc
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Write denoise history to memory (variant reading the history from response
+// register nDI_HIST_OFFSET).
+// Steps: build the block origin/size in rMSGSRC, copy it to the message
+// header, stage the history data in the following MRF, then issue the write.
+shr (2)    rMSGSRC.0<1>:ud    wORIX<2;2,1>:w            2:w                      NODDCLR           // X,Y origin / 4
+add (1)    rMSGSRC.0<1>:ud    rMSGSRC.0<0;1,0>:ud       uwSPITCH_DIV2<0;1,0>:uw  NODDCLR_NODDCHK  // Add pitch to X origin
+// NOTE(review): only a single dword is staged by the mov (1) below, while the
+// block-size comment says 4x2 - confirm against nDPW_BLOCK_SIZE_HIST.
+mov (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_HIST:ud                            NODDCHK           // block width and height (4x2)
+mov (8)    mMSGHDR_HIST<1>:ud      rMSGSRC.0<8;8,1>:ud                   // message header   
+mov (1)    mudMSGHDR_HIST(1)<1>    udRESP(nDI_HIST_OFFSET,0)<0;1,0>    // Move denoise history to MRF
+// Data-port write through the nBI_STMM_HISTORY_OUTPUT binding index.
+send (8)   dNULLREG    mMSGHDR_HIST    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+    // Pack the two DI output frames held in the sampler response into a packed
+    // YUV layout in the destination GRF block, then write each frame out with
+    // its own data-port write (nBI_DESTINATION_1_YUV / nBI_DESTINATION_2_YUV).
+    // NOTE(review): DNDI.inc defines nDPW_BLOCK_SIZE_DI as 32x4 while the comment
+    // below says 32x8 - confirm which definition is in effect here.
+    shl (1) rMSGSRC.0<1>:ud     wORIX<0;1,0>:w            1:w  NODDCLR             // Horizontal block origin must be doubled
+    mov (1) rMSGSRC.1<1>:ud     wORIY<0;1,0>:w                 NODDCLR_NODDCHK    // Block origin
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_DI:ud          NODDCHK             // Block width and height (32x8)
+	add (4) pCF_Y_OFFSET<1>:uw   ubDEST_CF_OFFSET<4;4,1>:ub   nDEST_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+	// Pack 2nd field Y (source: previous-frame luma rows of the response)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+		mov     (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2>       ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+	// Pack 1st field Y (source: current-frame luma rows of the response)
+	// NOTE(review): presumably the $for preprocessor evaluates "%1+4" before the
+	// multiply, placing these rows 4 GRFs after the rows packed above - TODO confirm.
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+		mov     (16) r[pCF_Y_OFFSET, %1+4*nGRFWIB]<2>       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+	// Pack 2nd field U (odd bytes of the previous-frame chroma rows)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_U_OFFSET,   %1*nGRFWIB]<4>  ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    }
+	 // Pack 1st field U (odd bytes of the current-frame chroma rows)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_U_OFFSET,   %1+4*nGRFWIB]<4>  ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    }
+	// Pack 2nd field V (even bytes of the previous-frame chroma rows)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_V_OFFSET,   %1*nGRFWIB]<4>  ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>  //V pixels
+    }
+	// Pack 1st field V (even bytes of the current-frame chroma rows)
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_V_OFFSET,   %1+4*nGRFWIB]<4>  ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>  //V pixels
+    }
+    //save the previous frame: header + packed registers 0..3
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud
+    $for(0; <4; 1) {
+            mov (8) mudMSGPAYLOAD(%1)<1>  udDEST_YUV(%1)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_1_YUV:ud
+    //save the current frame: same header, packed registers 4..7
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud
+    $for(0; <4; 1) {
+            mov (8) mudMSGPAYLOAD(%1)<1>  udDEST_YUV(%1+4)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_2_YUV:ud
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Module name: DNDI.inc
+#ifdef GT
+// GT DI Kernel
+#else // ILK
+// ILK DI Kernel
+#endif
+#include "undefall.inc"
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+          // Message descriptor for sampler read
+//        //                      = 000 0010 (message len 2) 00000 (resp len - set later: 12, 5, 9 or 11)
+//        //                        1 (header present 1) 0 11 (SIMD32/64 mode) 
+//        //                        1000 (message type) 0000 (DI state index) 
+//        //                        00000000 (binding table index - set later)
+//        //                      = 0x040b8000
+#define nSMPL_DI_MSGDSC           0x040b8000
+// Sampler response lengths (in GRFs) for each algorithm variant
+#define nSMPL_RESP_LEN_DNDI      nRESLEN_12      // 12 - for DN + DI Alg
+#define nSMPL_RESP_LEN_DN_PL     nRESLEN_5       // 5  - for DN Planar Alg
+#define nSMPL_RESP_LEN_DN_PA     nRESLEN_9       // 9  - for DN Packed Alg
+#define nSMPL_RESP_LEN_DI        nRESLEN_9       // 9  - for DI Only Alg
+#define nSMPL_RESP_LEN_PDI       nRESLEN_11      // 11 - for Partial DI Alg
+// Attention: The Message Length is The Number of GRFs with Data Only, without the Header
+#define nDPMW_MSG_LEN_STMM       nMSGLEN_1       // 1 - For STMM Save
+#define nDPMW_MSG_LEN_HIST       nMSGLEN_1       // 1 - For Denoise History Save
+#define nDPMW_MSG_LEN_PA_DN_DI   nMSGLEN_4       // 4 - For DN Curr Save
+#define nDPMW_MSG_LEN_PA_DN_NODI nMSGLEN_8       // 8 - For DN Curr Save (denoise only - DI disabled)
+#define nDPMW_MSG_LEN_PL_DN_DI   nMSGLEN_2       // 2 - For DN Curr Save
+#define nDPMW_MSG_LEN_PL_DN_NODI nMSGLEN_4       // 4 - For DN Curr Save (denoise only - DI disabled)
+#define nDPW_BLOCK_SIZE_STMM   nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4   // Y block size 8x4
+// Override any earlier DI block/message size with the 32x4, 4-register form
+#undef  nDPW_BLOCK_SIZE_DI
+#undef  nDPW_MSG_SIZE_DI
+#define nDPW_BLOCK_SIZE_DI  nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4    
+#define nDPW_MSG_SIZE_DI    nMSGLEN_4
+//---------------------------------------------------------------------------
+// Kernel GRF variables 
+//---------------------------------------------------------------------------
+// Register offsets below are relative to the start of the response block
+// (they are used as the register index of udRESP / ubRESP).
+// Defines for DI enabled
+#define nDI_PREV_FRAME_LUMA_OFFSET          0
+#define nDI_PREV_FRAME_CHROMA_OFFSET        2
+#define nDI_CURR_FRAME_LUMA_OFFSET          4
+#define nDI_CURR_FRAME_CHROMA_OFFSET        6
+#define nDI_STMM_OFFSET                     8
+#define nDI_HIST_OFFSET                     9
+#define nDI_CURR_2ND_FIELD_LUMA_OFFSET     10
+#define nDI_CURR_2ND_FIELD_CHROMA_OFFSET   11
+// Defines for DI disabled
+#define nNODI_LUMA_OFFSET                   0
+#define nNODI_HIST_OFFSET                   4
+#define nNODI_CHROMA_OFFSET                 5
+// nHIST_OFFSET selects the history register for the active mode; it is only
+// defined when the includer has defined DI_ENABLE or DI_DISABLE.
+#ifdef DI_ENABLE
+    #define nHIST_OFFSET    nDI_HIST_OFFSET
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame) 
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+#endif
+#ifdef DI_DISABLE
+    #define nHIST_OFFSET    nNODI_HIST_OFFSET
+#endif
+// Select source/destination aliases by region: when nSRC_REGION is nREGION_2
+// the BOT names are the source/dest planes and the TOP block is used for
+// packing; otherwise the roles are swapped and nSRC_REGION becomes nREGION_1.
+// nRESP is nTEMP0 in both cases.
+#if (nSRC_REGION==nREGION_2)
+    #define ub2SRC_Y      ub2BOT_Y
+    #define ub2SRC_U      ub2BOT_U
+    #define ub2SRC_V      ub2BOT_V
+    #define uwDEST_Y      uwBOT_Y
+    #define uwDEST_U      uwBOT_U
+    #define uwDEST_V      uwBOT_V
+    #define nDEST_YUV_REG nTOP_Y
+    #define udDEST_YUV    udTOP_Y_IO
+    #define nRESP         nTEMP0         // DI return message requires 12 GRFs
+    #define nDN_YUV       nTOP_Y         // Space for Packing DN for next run requires 8 GRFs
+    #undef  nSRC_REGION
+    #define nSRC_REGION   nREGION_2
+#else
+    #define ub2SRC_Y      ub2TOP_Y
+    #define ub2SRC_U      ub2TOP_U
+    #define ub2SRC_V      ub2TOP_V
+    #define uwDEST_Y      uwTOP_Y
+    #define uwDEST_U      uwTOP_U
+    #define uwDEST_V      uwTOP_V
+    #define nDEST_YUV_REG nBOT_Y
+    #define udDEST_YUV    udBOT_Y_IO
+    #define nRESP         nTEMP0         // DI return message requires 12 GRFs
+    #define nDN_YUV       nBOT_Y         // Space for Packing DN for next run requires 8 GRFs
+    #undef  nSRC_REGION
+    #define nSRC_REGION   nREGION_1    // REGION_1 will be the source region for first kernel
+#endif
+// Message response (Denoised & DI-ed pixels & statistics)
+// Two typed views of the same GRF block starting at register nRESP:
+// dword view with an 8-wide default region, byte view with a 16-wide one.
+.declare udRESP      Base=REG(r,nRESP) ElementSize=4 SrcRegion=REGION(8,1) DstRegion=<1> Type=ud
+.declare ubRESP      Base=REG(r,nRESP) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
+// For Denoised Curr Output (Used as Prev in Next Run)
+.declare ubDN_YUV           Base=REG(r,nDN_YUV)    ElementSize=1 Type=ub
+.declare udDN_YUV           Base=REG(r,nDN_YUV)    ElementSize=4 Type=ud
+#define  npDN_YUV           nDN_YUV*nGRFWIB                                 
+// For DI Process Output (1st and 2nd Frames Output)
+//.declare udDI_YUV_PRIV      Base=REG(r,nTEMP0)    ElementSize=4 Type=ud   // Previous frame DI output
+//.declare udDI_YUV_CURR      Base=REG(r,nTEMP0)    ElementSize=4 Type=ud   // Current frame DI output
+//#define  npDI_YUV           nTEMP0*nGRFWIB                                  
+//---------------------------------------------------------------------------
+// Kernel MRF variables 
+//---------------------------------------------------------------------------
+// Symbolic MRF names for each message, each with typed .declare views rooted
+// at the same MRF base. Several messages share m1 (sampler command, denoise
+// output, first DI output).
+#define  mMSG_SMPL           m1                                              // Sampler Command is in: m1~m2
+.declare mudMSG_SMPL         Base=mMSG_SMPL         ElementSize=4 Type=ud
+.declare muwMSG_SMPL         Base=mMSG_SMPL         ElementSize=2 Type=uw
+#define mMSGHDR_DN           m1                                              // Denoise Output: m1~m9 for PA, m3~m5 for PL
+.declare mudMSGHDR_DN        Base=mMSGHDR_DN        ElementSize=4 Type=ud
+.declare mubMSGHDR_DN        Base=mMSGHDR_DN        ElementSize=1 Type=ub
+#define mMSGHDR_STMM         m11                                             // STMM Output: m11~m12
+.declare mudMSGHDR_STMM      Base=mMSGHDR_STMM      ElementSize=4 Type=ud
+#define mMSGHDR_HIST         m13                                             // HIST Output: m13~m14
+// ElementSize must match the 4-byte ud type (it was 1), so that any
+// sub-register index on this view is scaled in dwords like the other ud views.
+.declare mudMSGHDR_HIST      Base=mMSGHDR_HIST      ElementSize=4 Type=ud
+#define mMSGHDR_DI_1ST       m1                                              // DI output: m1~m5
+.declare mudMSGHDR_DI_1ST    Base=mMSGHDR_DI_1ST    ElementSize=4 Type=ud
+#define mMSGHDR_DI_2ND       m6                                              // DI output: m6~m10
+.declare mudMSGHDR_DI_2ND    Base=mMSGHDR_DI_2ND    ElementSize=4 Type=ud
+// end of DNDI.inc
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Activate the DNDI send command
+// Builds the two-register sampler message (header copied from rMSGSRC, then
+// the block origin written as words 4 and 12 of the second register) and
+// sends it; the response lands at udRESP(0). nSMPL_RESP_LEN must be defined
+// by the including file.
+mov (8)     mudMSG_SMPL(0)<1>        rMSGSRC.0<8;8,1>:ud    NODDCLR         // message header
+mov (1)     muwMSG_SMPL(1,4)<1>      wORIX<0;1,0>:w         NODDCLR_NODDCHK// horizontal origin
+mov (1)     muwMSG_SMPL(1,12)<1>     wORIY<0;1,0>:w         NODDCLR_NODDCHK         // vertical origin
+//mov (2)     muwMSG_SMPL(1,4)<2>      wORIX<2;2,1>:w       NODDCHK// problem during compile !! when using this line
+send (8)    udRESP(0)<1>    mMSG_SMPL  udDUMMY_NULL   nSMPL_ENGINE    nSMPL_DI_MSGDSC+nSMPL_RESP_LEN+nBI_CURRENT_SRC_YUV_HW_DI:ud
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Write denoise history to memory (variant reading the history from response
+// register nNODI_HIST_OFFSET; two dwords are staged for the write).
+// Steps: build the block origin/size in rMSGSRC, copy it to the message
+// header, stage the history data in the following MRF, then issue the write.
+shr (2)    rMSGSRC.0<1>:ud    wORIX<2;2,1>:w            2:w                       NODDCLR         // X,Y origin / 4
+add (1)    rMSGSRC.0<1>:ud    rMSGSRC.0<0;1,0>:ud       uwSPITCH_DIV2<0;1,0>:uw   NODDCLR_NODDCHK// Add pitch to X origin
+mov (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_HIST:ud                             NODDCHK         // block width and height (4x2)
+mov (8)    mMSGHDR_HIST<1>:ud      rMSGSRC.0<8;8,1>:ud                   // message header   
+mov (2)    mudMSGHDR_HIST(1)<1>    udRESP(nNODI_HIST_OFFSET,0)<2;2,1>    // Move denoise history to MRF
+// Data-port write through the nBI_STMM_HISTORY_OUTPUT binding index.
+send (8)   dNULLREG    mMSGHDR_HIST    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_16x8.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar 
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_16x8.asm"
+//------------------------------------------------------------------------------
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_8x4.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+//------------------------------------------------------------------------------
+// Unpacking sampler data to 4:2:0 internal planar 
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_8x4.asm"
+//------------------------------------------------------------------------------
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_8x8.asm ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+//------------------------------------------------------------------------------
+// Unpacking sampler data to 4:2:2 internal planar 
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_8x8.asm"
+//------------------------------------------------------------------------------
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Sample.asm ----------
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+// Each read: run the block setup include, copy the header/payload mirror
+// (16 dwords starting at rAVS_8x8_HDR) into the MRFs, and send to the sampler.
+// The first response goes to uwAVS_RESPONSE, the second to uwAVS_RESPONSE_2.
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // Enable RGB(YUV) channels
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RGB_CHANNELS:ud   
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
+    // Return YUV in 12 GRFs
+    // 2nd 8x8 setup (the channel-enable dword is not rewritten for this read)
+    #include "AVS_SetupSecondBlock.asm"
+    mov (16) mAVS_8x8_HDR_2.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
+    // Return YUV in 12 GRFs
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Unpack_8x4.asm ----------
+// Yoni: In order to optimize unpacking, 3 methods are being checked:
+//  1. AVS_ORIGINAL
+//  2. AVS_ROUND_TO_8_BITS  
+//  3. AVS_INDIRECT_ACCESS  
+//
+// Only 1 method should stay in the code 
+//#define AVS_ROUND_TO_8_BITS
+//#define AVS_INDIRECT_ACCESS
+// All moves below read bytes at odd offsets of the sampler response and widen
+// them to words. Region <16;4,2> takes 4 bytes (stride 2) from each of two
+// 16-byte rows; region <16;2,4> takes 2 bytes (stride 4) from each of two rows.
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>               
+    // Move first 4x4 words of V to dest GRF  
+    mov (4) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;2,4>                 
+    mov (4) uwDEST_V(0,8)<1>    ubAVS_RESPONSE(1,1)<16;2,4>                 
+    mov (4) uwDEST_V(1)<1>      ubAVS_RESPONSE(6,1)<16;2,4>                 
+    mov (4) uwDEST_V(1,8)<1>    ubAVS_RESPONSE(7,1)<16;2,4>                 
+    // Move first 4x4 words of U to dest GRF        
+    mov (4) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;2,4>           
+    mov (4) uwDEST_U(0,8)<1>    ubAVS_RESPONSE(5,1)<16;2,4>           
+    mov (4) uwDEST_U(1)<1>      ubAVS_RESPONSE(10,1)<16;2,4>          
+    mov (4) uwDEST_U(1,8)<1>    ubAVS_RESPONSE(11,1)<16;2,4>          
+    // Move second 8x8 words of Y to dest GRF
+    mov (8) uwDEST_Y(0,8)<1>    ubAVS_RESPONSE_2(2,1)<16;4,2>    
+    mov (8) uwDEST_Y(1,8)<1>    ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>    ubAVS_RESPONSE_2(3,1)<16;4,2>     
+    mov (8) uwDEST_Y(3,8)<1>    ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>    ubAVS_RESPONSE_2(8,1)<16;4,2>  
+    mov (8) uwDEST_Y(5,8)<1>    ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>    ubAVS_RESPONSE_2(9,1)<16;4,2>     
+    mov (8) uwDEST_Y(7,8)<1>    ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+    // Move second 4x4 words of V to dest GRF        
+    mov (4) uwDEST_V(0,4)<1>    ubAVS_RESPONSE_2(0,1)<16;2,4>           
+    mov (4) uwDEST_V(0,12)<1>   ubAVS_RESPONSE_2(1,1)<16;2,4>           
+    mov (4) uwDEST_V(1,4)<1>    ubAVS_RESPONSE_2(6,1)<16;2,4>           
+    mov (4) uwDEST_V(1,12)<1>   ubAVS_RESPONSE_2(7,1)<16;2,4>           
+    // Move second 4x4 words of U to dest GRF        
+    mov (4) uwDEST_U(0,4)<1>    ubAVS_RESPONSE_2(4,1)<16;2,4>             
+    mov (4) uwDEST_U(0,12)<1>   ubAVS_RESPONSE_2(5,1)<16;2,4>             
+    mov (4) uwDEST_U(1,4)<1>    ubAVS_RESPONSE_2(10,1)<16;2,4>            
+    mov (4) uwDEST_U(1,12)<1>   ubAVS_RESPONSE_2(11,1)<16;2,4>            
+//------------------------------------------------------------------------------
+       // Re-define new number of lines
+       // NOTE(review): only uwDEST_U/uwDEST_V registers 0 and 1 are filled above
+       // (presumably four 8-word chroma rows) - confirm nUV_NUM_OF_ROWS = 8 is intended.
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_AVS_IEF_Unpack_8x8.asm ----------
+// Yoni: In order to optimize unpacking, 3 methods are being checked:
+//  1. AVS_ORIGINAL
+//  2. AVS_ROUND_TO_8_BITS  
+//  3. AVS_INDIRECT_ACCESS  
+//
+// Only 1 method should stay in the code 
+//#define AVS_ROUND_TO_8_BITS
+//#define AVS_INDIRECT_ACCESS
+// All moves below read bytes at odd offsets of the sampler response and widen
+// them to words. Region <16;4,2> takes 4 bytes (stride 2) from each of two
+// 16-byte rows; region <16;2,4> takes 2 bytes (stride 4) from each of two rows.
+// Data from the first read fills word columns 0-7 of each Y row (0-3 for U/V);
+// data from the second read fills the columns after them.
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>               
+    // Move first 4x8 words of V to dest GRF  
+    mov (4) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;2,4>                 
+    mov (4) uwDEST_V(0,8)<1>    ubAVS_RESPONSE(0,8+1)<16;2,4>               
+    mov (4) uwDEST_V(1)<1>      ubAVS_RESPONSE(1,1)<16;2,4>                 
+    mov (4) uwDEST_V(1,8)<1>    ubAVS_RESPONSE(1,8+1)<16;2,4>               
+    mov (4) uwDEST_V(2)<1>      ubAVS_RESPONSE(6,1)<16;2,4>                 
+    mov (4) uwDEST_V(2,8)<1>    ubAVS_RESPONSE(6,8+1)<16;2,4>               
+    mov (4) uwDEST_V(3)<1>      ubAVS_RESPONSE(7,1)<16;2,4>                 
+    mov (4) uwDEST_V(3,8)<1>    ubAVS_RESPONSE(7,8+1)<16;2,4>               
+    // Move first 4x8 words of U to dest GRF        
+    mov (4) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;2,4>           
+    mov (4) uwDEST_U(0,8)<1>    ubAVS_RESPONSE(4,8+1)<16;2,4>                 
+    mov (4) uwDEST_U(1)<1>      ubAVS_RESPONSE(5,1)<16;2,4>           
+    mov (4) uwDEST_U(1,8)<1>    ubAVS_RESPONSE(5,8+1)<16;2,4>                 
+    mov (4) uwDEST_U(2)<1>      ubAVS_RESPONSE(10,1)<16;2,4>          
+    mov (4) uwDEST_U(2,8)<1>    ubAVS_RESPONSE(10,8+1)<16;2,4>                
+    mov (4) uwDEST_U(3)<1>      ubAVS_RESPONSE(11,1)<16;2,4>          
+    mov (4) uwDEST_U(3,8)<1>    ubAVS_RESPONSE(11,8+1)<16;2,4>                
+    // Move second 8x8 words of Y to dest GRF
+    mov (8) uwDEST_Y(0,8)<1>    ubAVS_RESPONSE_2(2,1)<16;4,2>    
+    mov (8) uwDEST_Y(1,8)<1>    ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>    ubAVS_RESPONSE_2(3,1)<16;4,2>     
+    mov (8) uwDEST_Y(3,8)<1>    ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>    ubAVS_RESPONSE_2(8,1)<16;4,2>  
+    mov (8) uwDEST_Y(5,8)<1>    ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>    ubAVS_RESPONSE_2(9,1)<16;4,2>     
+    mov (8) uwDEST_Y(7,8)<1>    ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+    // Move second 4x8 words of V to dest GRF        
+    mov (4) uwDEST_V(0,4)<1>    ubAVS_RESPONSE_2(0,1)<16;2,4>           
+    mov (4) uwDEST_V(0,12)<1>   ubAVS_RESPONSE_2(0,8+1)<16;2,4>                 
+    mov (4) uwDEST_V(1,4)<1>    ubAVS_RESPONSE_2(1,1)<16;2,4>           
+    mov (4) uwDEST_V(1,12)<1>   ubAVS_RESPONSE_2(1,8+1)<16;2,4>                 
+    mov (4) uwDEST_V(2,4)<1>    ubAVS_RESPONSE_2(6,1)<16;2,4>           
+    mov (4) uwDEST_V(2,12)<1>   ubAVS_RESPONSE_2(6,8+1)<16;2,4>                 
+    mov (4) uwDEST_V(3,4)<1>    ubAVS_RESPONSE_2(7,1)<16;2,4>           
+    mov (4) uwDEST_V(3,12)<1>   ubAVS_RESPONSE_2(7,8+1)<16;2,4>                 
+    // Move second 4x8 words of U to dest GRF        
+    mov (4) uwDEST_U(0,4)<1>    ubAVS_RESPONSE_2(4,1)<16;2,4>             
+    mov (4) uwDEST_U(0,12)<1>   ubAVS_RESPONSE_2(4,8+1)<16;2,4>           
+    mov (4) uwDEST_U(1,4)<1>    ubAVS_RESPONSE_2(5,1)<16;2,4>             
+    mov (4) uwDEST_U(1,12)<1>   ubAVS_RESPONSE_2(5,8+1)<16;2,4>           
+    mov (4) uwDEST_U(2,4)<1>    ubAVS_RESPONSE_2(10,1)<16;2,4>            
+    mov (4) uwDEST_U(2,12)<1>   ubAVS_RESPONSE_2(10,8+1)<16;2,4>          
+    mov (4) uwDEST_U(3,4)<1>    ubAVS_RESPONSE_2(11,1)<16;2,4>            
+    mov (4) uwDEST_U(3,12)<1>   ubAVS_RESPONSE_2(11,8+1)<16;2,4>          
+//------------------------------------------------------------------------------
+       // Re-define new number of lines (8 rows for Y and for U/V after unpacking)
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Kernel setup for packed DN+DI: enable the DI layout in DNDI.inc, pick the
+// sampler response length (DI-only vs. DN+DI), set the write block sizes,
+// then issue the DNDI sampler command.
+#define DI_ENABLE
+    #include "DNDI.inc"
+    #undef  nSMPL_RESP_LEN
+    #ifdef DI_ONLY
+		#define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DI               // set the number of GRF 
+	#else
+		#define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+	#endif
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST block size for write: width 4 + height 1 macros (original note said 4x2 - TODO confirm)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4   // DN Block Size for Write is 32x4
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    // The include name must match the on-disk file name (DNDI_COMMAND.asm)
+    // exactly so the build also works on case-sensitive filesystems.
+    #include "DNDI_COMMAND.asm"
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    //// move the previous frame Y component to internal planar format
+    //$for (0; <nY_NUM_OF_ROWS/2; 1) {
+    //    mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    //}
+    //// move the previous frame U,V components to internal planar format
+    //$for (0; <nUV_NUM_OF_ROWS/2; 1) {
+    //    mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    //    mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    //}
+    //// move the current frame Y component to internal planar format
+    //$for (0; <nY_NUM_OF_ROWS/2; 1) {
+    //    mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    //}
+    //// move the current frame U,V components to internal planar format
+    //$for (0; <nUV_NUM_OF_ROWS/2; 1) {
+    //    mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    //    mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    //}
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    // Origin is (X/2, Y), block size nDPW_BLOCK_SIZE_STMM; the data is one
+    // response register (nDI_STMM_OFFSET) copied into the MRF after the header.
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     NODDCLR          // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    NODDCLR_NODDCHK // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud          NODDCHK         // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#ifdef DI_ONLY
+#else
+    #include "DI_Hist_Save.asm"
+////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
+    // check top/bottom field first
+    // f0.0 is set when ubTFLD_FIRST equals 1; it is consumed by the predicated
+    // jump at the end of this block.
+	cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:uw
+	//set the save DN position
+    shl (1)     rMSGSRC.0<1>:ud      wORIX<0;1,0>:w          1:w NODDCLR           // X origin * 2
+    mov (1)     rMSGSRC.1<1>:ud      wORIY<0;1,0>:w              NODDCLR_NODDCHK   // Y origin
+    mov (1)     rMSGSRC.2<1>:ud      nDPW_BLOCK_SIZE_DN:ud       NODDCHK             // block width and height (32x4, per the nDPW_BLOCK_SIZE_DN define)
+    mov (8)     mudMSGHDR_DN(0)<1>   rMSGSRC.0<8;8,1>:ud
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+BOTTOM_FIELD_FIRST:
+    //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+    //    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
+    //    mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub   ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
+    //    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
+    //    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
+    //    mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
+    //    mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field U from current frame (line 1,3)
+    //}
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
+        mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub   ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
+        mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
+        mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field V from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+TOP_FIELD_FIRST:
+    //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+    //    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
+    //    mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
+    //    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
+    //    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
+    //    mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
+    //    mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
+    //}
+	$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
+        mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
+    }
+	$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
+        mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
+    }
+	$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
+        mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    $for(0; <nY_NUM_OF_ROWS/2; 1) {
+            mov (8) mudMSGHDR_DN(%1+1)<1>  udDN_YUV(%1)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_DI+nBI_DESTINATION_YUV:ud
+#endif
+// Save Processed frames
+#include "DI_Save_PA.asm"      
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#define DI_DISABLE                                               // Defined before DNDI.inc is pulled in; presumably selects the denoise-only (no deinterlace) path - confirm in DNDI.inc
+#include "DNDI.inc"
+#undef  nY_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS         8                                 // Number of Y rows per block (overrides the value inherited from DNDI.inc)
+#undef  nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS        8                                 // Number of U/V rows per block (same as Y: the packing loops below copy one chroma row per luma row)
+#undef   nSMPL_RESP_LEN
+#define  nSMPL_RESP_LEN        nSMPL_RESP_LEN_DN_PA              // Set the number of GRFs in the DNDI response (DN-only, packed output variant)
+#undef   nDPW_BLOCK_SIZE_DN
+#define  nDPW_BLOCK_SIZE_DN    nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // DN Curr Block Size for Write is 32x8
+#undef   nDPW_BLOCK_SIZE_HIST
+#define  nDPW_BLOCK_SIZE_HIST  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2    // HIST Block Size for Write is 4x2
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+#include "DNDI_COMMAND.asm"
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#include "DNDI_Hist_Save.asm"
+////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run: interleave Y/U/V from the response into 32-byte packed rows, then write them out ///////////////
+add (4)     pCF_Y_OFFSET<1>:uw    ubDEST_CF_OFFSET<4;4,1>:ub    npDN_YUV:w  // 4 indirect-address words = 4 byte offsets from ubDEST_CF_OFFSET + npDN_YUV base (pCF_U_OFFSET/pCF_V_OFFSET presumably alias the following words - confirm in the .inc)
+$for (0; <nY_NUM_OF_ROWS; 1) {
+    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub   ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1>       // copy line of Y: 16 contiguous luma bytes go to every 2nd byte of packed row %1
+}
+$for (0; <nUV_NUM_OF_ROWS; 1) {
+    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub   ubRESP(nNODI_CHROMA_OFFSET,%1*16+1)<16;8,2>    // copy line of U: odd source bytes (start +1, step 2) go to every 4th byte of packed row %1
+    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub   ubRESP(nNODI_CHROMA_OFFSET,%1*16)<16;8,2>      // copy line of V: even source bytes (start +0, step 2) go to every 4th byte of packed row %1
+}
+shl (1)     rMSGSRC.0<1>:ud     wORIX<0;1,0>:w     1:w       // X origin * 2 (422 output)
+mov (1)     rMSGSRC.1<1>:ud     wORIY<0;1,0>:w               // Y origin
+mov (1)     rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_DN:ud        // block width and height (32x8)
+mov (8)     mMSGHDR_DN<1>:ud    rMSGSRC<8;8,1>:ud            // message header: copy all 8 dwords of rMSGSRC
+$for(0; <nY_NUM_OF_ROWS; 2) {
+        mov (16) mudMSGHDR_DN(1+%1)<1>  udDN_YUV(%1)REGION(8,1)    // Move DN Curr to MRF: 16 dwords per iteration, hence the $for step of 2 (two 32-byte rows at a time)
+}
+send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_NODI+nBI_DESTINATION_YUV:ud     // Write message (nDATAPORT_WRITE) for the packed block; destination is dNULLREG, so no response is read back
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PA_Scaling.asm: sample 16 pixels per output line from the packed YUV source and store V, Y, U as bytes ----------
+#include "Scaling.inc"
+	// Build a 16-element float32 ramp: SAMPLER_RAMP(0) holds 0..7, SAMPLER_RAMP(1) holds 8..15
+//	mov (8)		SAMPLER_RAMP(0)<1>		0x76543210:v
+//	add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf		//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf	//7, 6, 5, 4 in float vector
+add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f	// 8..15 = (0..7) + 8
+//Module: PrepareScaleCoord.asm
+	// Setup for sampler msg hdr
+    mov (2)		rMSGSRC.0<1>:ud			0:ud						{ NoDDClr }	// Unused fields
+    mov (1)		rMSGSRC.2<1>:ud			0:ud						{ NoDDChk }	// Write and offset
+	// Calculate 16 v based on the step Y and vertical origin
+	mov	(16)	mfMSGPAYLOAD(2)<1>		fSRC_VID_V_ORI<0;1,0>:f	// First line: all 16 v coordinates equal the vertical origin
+	mov	(16)	SCALE_COORD_Y<1>:f		fSRC_VID_V_ORI<0;1,0>:f	// Working copy of v; advanced by fVIDEO_STEP_Y once per line in the loop below
+	// Calculate 16 u based on the step X and hori origin
+//	line (16)	mfMSGPAYLOAD(0)<1>		SCALE_STEP_X<0;1,0>:f		SAMPLER_RAMP(0) 	// Assign to mrf directly
+	mov	(16)	acc0:f							fSRC_VID_H_ORI<0;1,0>:f											{ Compr }	// acc0 = horizontal origin
+	mac	(16)	mfMSGPAYLOAD(0)<1>	fVIDEO_STEP_X<0;1,0>:f	SAMPLER_RAMP(0)			{ Compr }			// u[i] = origin + step_x * ramp[i], written straight to the message payload
+	//Setup the constants for line instruction
+	mov 	(1)		SCALE_LINE_P255<1>:f		255.0:f 			{ NoDDClr }	//{ NoDDClr, NoDDChk }
+	mov 	(1)		SCALE_LINE_P0_5<1>:f		0.5:f 				{ NoDDChk }	// presumably the +0.5 rounding term consumed by 'line' - confirm element placement in Scaling.inc
+//------------------------------------------------------------------------------
+$for (0; <nY_NUM_OF_ROWS; 1) {
+	// Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+  mov (8) 	MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+	send (16)	SCALE_RESPONSE_YW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_YUV+nBI_CURRENT_SRC_YUV
+	// Calculate 16 v for next line
+	add (16)	mfMSGPAYLOAD(2)<1>		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Assign to mrf directly
+	add (16)	SCALE_COORD_Y<1>:f		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Advance the working copy of v by the same step
+	// Scale back to [0, 255], convert f to ud
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(0)	{ Compr }			// Process B, V (response GRFs 0-1)
+	mov  (16) SCALE_RESPONSE_YD(0)<1>	acc0:f														{ Compr }
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(2)	{ Compr }			// Process G, Y (response GRFs 2-3)
+	mov  (16) SCALE_RESPONSE_YD(2)<1>	acc0:f														{ Compr }
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(4)	{ Compr }			// Process R, U (response GRFs 4-5)
+	mov  (16) SCALE_RESPONSE_YD(4)<1>	acc0:f														{ Compr }
+	mov	 (16) 	DEST_V(%1)<1>				SCALE_RESPONSE_YB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_Y(%1)<1>				SCALE_RESPONSE_YB(2)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_U(%1)<1>				SCALE_RESPONSE_YB(4)											//possible error due to truncation - vK
+}
+	#define nSRC_REGION				nREGION_1
+//------------------------------------------------------------------------------
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_16x8.asm: AVS sampling of two 8x8 blocks, Y and UV read from separate surfaces ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 2 sampler reads for 8x8 U and 8x8 V (NV11\P208 input surface)
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud               // Channel mask is written to dword 2 of the header
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 8x8 U and V sampling 
+    // Enable red and blue channels
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud                           
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+    // 2nd 8x8 U and V sampling 
+    // Enable red and blue channels
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_16x8.asm"
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_8x4.asm: AVS sampling of two 8x8 Y blocks plus one UV block, Y and UV read from separate surfaces ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud               // Channel mask is written to dword 2 of the header
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 8x8 U and V sampling 
+    // Enable red and blue channels  
+    // Only 8x4 of the sampled chroma will be used
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+    // Calculate Chroma Step Size:
+    // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X 
+    // for V direction: 8  Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
+    mul  (1)  rAVS_PAYLOAD.1:f      fVIDEO_STEP_X:f    2.0:f             // Step X for chroma
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud                           
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:0 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_8x4.asm"
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_8x8.asm: AVS sampling of two 8x8 Y blocks plus one 8x8 UV block, Y and UV read from separate surfaces ----------
+#include "AVS_IEF.inc"
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
+//------------------------------------------------------------------------------
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud               // Channel mask is written to dword 2 of the header
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+    // 8x8 U and V sampling 
+    // Enable red and blue channels    
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+    // Calculate Chroma Step Size:
+    // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X 
+    // for V direction: 8  Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
+    mul  (1)  rAVS_PAYLOAD.1:f      fVIDEO_STEP_X:f    2.0:f             // Step X for chroma
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud                           
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:2 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_8x8.asm"
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_Unpack_8x4.asm ----------
+    // Move first 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word: source bytes 1,3,5,7 and 17,19,21,23 of response GRF %1
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word: source bytes 9,11,13,15 and 25,27,29,31 of response GRF %1
+    } 
+    // Move 8x4 words of U to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>      // Only the sub-row starting at byte 1 of each response GRF is read for chroma here
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(5,1)<16;4,2>    
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(8,1)<16;4,2>      
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(9,1)<16;4,2>    
+    // Move 8x4 words of V to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(6,1)<16;4,2>      
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(7,1)<16;4,2>    
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(10,1)<16;4,2>      
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(11,1)<16;4,2>    
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    } 
+//------------------------------------------------------------------------------
+    // Re-define new # of lines: 8 rows of Y, 4 rows of U/V after this unpack
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     4
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_AVS_IEF_Unpack_8x8.asm ----------
+    // Move first 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word: source bytes 1,3,5,7 and 17,19,21,23 of response GRF %1
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word: source bytes 9,11,13,15 and 25,27,29,31 of response GRF %1
+    } 
+    // Move 8x8 words of U to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>      // U is taken from response GRFs 4, 5, 8 and 9; both sub-rows (byte 1 and byte 8+1) of each are used
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(4,8+1)<16;4,2>    
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(5,1)<16;4,2>      
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(5,8+1)<16;4,2>    
+    mov (8) uwDEST_U(2)<1>            ubAVS_RESPONSE(8,1)<16;4,2>      
+    mov (8) uwDEST_U(2,8)<1>          ubAVS_RESPONSE(8,8+1)<16;4,2>    
+    mov (8) uwDEST_U(3)<1>            ubAVS_RESPONSE(9,1)<16;4,2>      
+    mov (8) uwDEST_U(3,8)<1>          ubAVS_RESPONSE(9,8+1)<16;4,2>    
+    // Move 8x8 words of V to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(6,1)<16;4,2>      // V is taken from response GRFs 6, 7, 10 and 11
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(6,8+1)<16;4,2>    
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(7,1)<16;4,2>      
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(7,8+1)<16;4,2>    
+    mov (8) uwDEST_V(2)<1>            ubAVS_RESPONSE(10,1)<16;4,2>     
+    mov (8) uwDEST_V(2,8)<1>          ubAVS_RESPONSE(10,8+1)<16;4,2>   
+    mov (8) uwDEST_V(3)<1>            ubAVS_RESPONSE(11,1)<16;4,2>     
+    mov (8) uwDEST_V(3,8)<1>          ubAVS_RESPONSE(11,8+1)<16;4,2>   
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    } 
+//------------------------------------------------------------------------------
+    // Re-define new # of lines: 8 rows of Y, 8 rows of U/V after this unpack
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+//---------- PL2_Scaling.asm: sample 16 pixels per output line from separate Y and UV surfaces and store Y, U, V as bytes ----------
+#include "Scaling.inc"
+	// Build a 16-element float32 ramp: SAMPLER_RAMP(0) holds 0..7, SAMPLER_RAMP(1) holds 8..15
+//	mov (8)		SAMPLER_RAMP(0)<1>		0x76543210:v
+//	add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf		//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf	//7, 6, 5, 4 in float vector
+add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f	// 8..15 = (0..7) + 8
+//Module: PrepareScaleCoord.asm
+	// Setup for sampler msg hdr
+    mov (2)		rMSGSRC.0<1>:ud			0:ud						{ NoDDClr }	// Unused fields
+    mov (1)		rMSGSRC.2<1>:ud			0:ud						{ NoDDChk }	// Write and offset
+	// Calculate 16 v based on the step Y and vertical origin
+	mov	(16)	mfMSGPAYLOAD(2)<1>		fSRC_VID_V_ORI<0;1,0>:f	// First line: all 16 v coordinates equal the vertical origin
+	mov	(16)	SCALE_COORD_Y<1>:f		fSRC_VID_V_ORI<0;1,0>:f	// Working copy of v; advanced by fVIDEO_STEP_Y once per line in the loop below
+	// Calculate 16 u based on the step X and hori origin
+//	line (16)	mfMSGPAYLOAD(0)<1>		SCALE_STEP_X<0;1,0>:f		SAMPLER_RAMP(0) 	// Assign to mrf directly
+	mov	(16)	acc0:f							fSRC_VID_H_ORI<0;1,0>:f											{ Compr }	// acc0 = horizontal origin
+	mac	(16)	mfMSGPAYLOAD(0)<1>	fVIDEO_STEP_X<0;1,0>:f	SAMPLER_RAMP(0)			{ Compr }			// u[i] = origin + step_x * ramp[i], written straight to the message payload
+	//Setup the constants for line instruction
+	mov 	(1)		SCALE_LINE_P255<1>:f		255.0:f 			{ NoDDClr }	//{ NoDDClr, NoDDChk }
+	mov 	(1)		SCALE_LINE_P0_5<1>:f		0.5:f 				{ NoDDChk }	// presumably the +0.5 rounding term consumed by 'line' - confirm element placement in Scaling.inc
+//------------------------------------------------------------------------------
+$for (0; <nY_NUM_OF_ROWS; 1) {
+	// Read 16 sampled pixels from the Y surface and from the UV surface; the float32 results land in SCALE_RESPONSE_Y* and SCALE_RESPONSE_U*.
+  mov (8) 	MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+	send (16)	SCALE_RESPONSE_YW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
+	send (16)	SCALE_RESPONSE_UW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_UV+nBI_CURRENT_SRC_UV
+	// Calculate 16 v for next line
+	add (16)	mfMSGPAYLOAD(2)<1>		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Assign to mrf directly
+	add (16)	SCALE_COORD_Y<1>:f		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Advance the working copy of v by the same step
+	// Scale back to [0, 255], convert f to ud
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(0)	{ Compr }			// Process Y
+	mov  (16) SCALE_RESPONSE_YD(0)<1>	acc0:f														{ Compr }
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_UF(0)	{ Compr }			// Process U (UV response GRFs 0-1)
+	mov  (16) SCALE_RESPONSE_UD(0)<1>	acc0:f														{ Compr }
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_UF(2)	{ Compr }			// Process V (UV response GRFs 2-3)
+	mov  (16) SCALE_RESPONSE_UD(2)<1>	acc0:f														{ Compr }
+	mov	 (16) 	DEST_Y(%1)<1>				SCALE_RESPONSE_YB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_U(%1)<1>				SCALE_RESPONSE_UB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_V(%1)<1>				SCALE_RESPONSE_UB(2)											//possible error due to truncation - vK
+}
+	#define nSRC_REGION				nREGION_1
+//------------------------------------------------------------------------------
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm
--- a/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc
--- a/i965_drv_video/shaders/post_processing/Makefile.am
+++ b/i965_drv_video/shaders/post_processing/Makefile.am
--- a/i965_drv_video/shaders/post_processing/null.g4a
+++ b/i965_drv_video/shaders/post_processing/null.g4a
+/* Just for test: a single send to the thread spawner with EOT set (1 message register, no response), so the kernel does no work of its own */
+send(16) 0 acc0<1>UW g0<8,8,1>UW thread_spawner(0, 0, 0) mlen 1 rlen 0 {align1 EOT};
--- a/i965_drv_video/shaders/post_processing/null.g4b
+++ b/i965_drv_video/shaders/post_processing/null.g4b
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x87100000 },
--- a/i965_drv_video/shaders/post_processing/null.g4b.gen5
+++ b/i965_drv_video/shaders/post_processing/null.g4b.gen5
--- a/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm
+++ b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm
--- a/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5
+++ b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5
--- a/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm
+++ b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm
--- a/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5
+++ b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5
--- a/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm
+++ b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm
--- a/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5
+++ b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5
--- a/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm
+++ b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm
--- a/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5
+++ b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5