Commit 20975a94 authored by Xiang, Haihao's avatar Xiang, Haihao

i965_drv_video: add video processing kernels

parent 5f2030ba
...@@ -163,6 +163,7 @@ AC_OUTPUT([ ...@@ -163,6 +163,7 @@ AC_OUTPUT([
i965_drv_video/shaders/mpeg2/Makefile i965_drv_video/shaders/mpeg2/Makefile
i965_drv_video/shaders/mpeg2/vld/Makefile i965_drv_video/shaders/mpeg2/vld/Makefile
i965_drv_video/shaders/render/Makefile i965_drv_video/shaders/render/Makefile
i965_drv_video/shaders/post_processing/Makefile
test/Makefile test/Makefile
test/basic/Makefile test/basic/Makefile
test/decode/Makefile test/decode/Makefile
......
SUBDIRS = h264 mpeg2 render SUBDIRS = h264 mpeg2 render post_processing
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: AYUV_Load_16x8.asm
//----------------------------------------------------------------
#include "AYUV_Load_16x8.inc"
// In order to load 64x8 AYUV data (16x8 pixels), we need to divide the data
// into two regions and load them separately.
//
// 32 byte 32 byte
//|----------------|----------------|
//| | |
//| A | B |8
//| | |
//| | |
//|----------------|----------------|
// Load the first 32x8 data block
// Packed data block should be loaded as 32x8 pixel block
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin
shl (1) rMSGSRC.0<1>:d acc0:w 2:w { NoDDClr } // H. block origin need to be four times larger
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV:ud { NoDDChk } // Block width and height (32x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_YUV(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
//Load the second 32x8 data block
// Offset the origin X - move to next 32 colomns
add (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 32:w // Increase X origin by 8
// Size stays the same - 32x8
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud // Copy message description to message header
send (8) udSRC_YUV(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
// Give AYUV region addresses to address register
mov (1) SRC_YUV_OFFSET<1>:ud 0x00400038*32:ud //Address registers contain starting addresses of two halves
//Directly move the data to destination
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (16) uwDEST_Y(%1)<1> r[SRC_YUV_OFFSET,%1*32+2]<8,4>:ub
mov (16) uwDEST_U(%1)<1> r[SRC_YUV_OFFSET,%1*32+1]<8,4>:ub
mov (16) uwDEST_V(%1)<1> r[SRC_YUV_OFFSET,%1*32+0]<8,4>:ub
}
\ No newline at end of file
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: AYUV_Load_16x8.inc
//
// AYUV data are first loaded to bottom I/O REGION_2, then unpacked to planar data
// and stored in top I/O REGION_1
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
#define nDPR_BLOCK_SIZE_YUV nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // Y block size 32x8
#define nDPR_MSG_SIZE_YUV nRESLEN_8 // # of MRF's to hold Y block data (8)
//Temporary storage for unpacked AYUV data
#define rUNPACK_TEMP REG(r,nTEMP0)
.declare udUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=4 SrcRegion=<8;8,1> Type=ud //1 GRF
.declare ubUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=1 SrcRegion=<32;32,1> Type=ub //1 GRF
.declare ubBOT_Y_IO Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(32,1) Type=ub
#define udSRC_YUV udBOT_Y_IO
#define ubSRC_YUV ubBOT_Y_IO
#define nSRC_YUV_REG nBOT_Y
#define uwDEST_Y uwTOP_Y
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#define SRC_YUV_OFFSET a0.0
#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
// End of AYUV_Load_16x8.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: Expansion.inc
// Number of U/V rows per block definition
#undef nUV_NUM_OF_ROWS
#ifdef EXPAND_9x5
#define nUV_NUM_OF_ROWS 6
#else
#define nUV_NUM_OF_ROWS 8
#endif
// Source/destination region definitions
#undef uwDEST_U
#undef uwDEST_V
#if (nSRC_REGION==nREGION_1)
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#elif (nSRC_REGION==nREGION_2)
#define uwDEST_U uwBOT_U
#define uwDEST_V uwBOT_V
#endif
// End of Expansion.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: IMC3_Load_8x4.asm
//
//----------------------------------------------------------------
#define IMC3_LOAD_8x4
#include "PL3_Load.inc"
// Load 16x8 planar Y ----------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 8x4 planar U and V -----------------------------------------------------
asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x4)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
mov (8) mMSGHDRV<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_V(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
mov (16) uwDEST_U(0, %1*16)<1> ubSRC_U(0, %1*16)
mov (16) uwDEST_V(0, %1*16)<1> ubSRC_V(0, %1*16)
}
// End of IMC3_Load_8x4
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: IMC3_Load_8x5.asm
//
//----------------------------------------------------------------
#define IMC3_LOAD_8x5
#include "PL3_Load.inc"
// Load 16x8 planar Y ----------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 8x5 planar U and V -----------------------------------------------------
asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x5)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
mov (8) mMSGHDRV<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_V(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
mov (16) uwDEST_U(0, %1*16)<1> ubSRC_U(0, %1*16)
mov (16) uwDEST_V(0, %1*16)<1> ubSRC_V(0, %1*16)
}
// End of IMC3_Load_8x5
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: IMC3_Load_9x5.asm
//
//----------------------------------------------------------------
// This module loads 16x8 Y, 9x5 U and 9x5 V planar data blocks for CSC module
// and stores it in byte-aligned format.
//----------------------------------------------------------------
#define IMC3_LOAD_9x5
#include "PL3_Load.inc"
// Load 16x8 planar Y ----------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 9x5 planar U and V -----------------------------------------------------
asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (12x5)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
mov (8) mMSGHDRV<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_V(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for(nUV_NUM_OF_ROWS-2; >-1; -1) {
mov (16) uwDEST_U(0, %1*16)<1> ubSRC_U(0, %1*16)
mov (16) uwDEST_V(0, %1*16)<1> ubSRC_V(0, %1*16)
}
// End of IMC3_Load_9x5
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
#ifdef GT // to remove error messages of un-initialized GRF
.declare udGRF_space Base=r0.0 ElementSize=4 SrcRegion=REGION(8,1) Type=ud
$for (7; <80; 1) {
mov (8) udGRF_space(%1)<1> 0:ud
}
#else
#endif
\ No newline at end of file
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
/////////////////////////////////////////////////////////////////////////////////
// Multiple_Loop.asm
// This lable is for satisfying component kernel build.
// DL will remove this label and reference the real one in Multiple_Loop_Head.asm.
#if defined(COMPONENT)
VIDEO_PROCESSING_LOOP:
#endif
//===== Possible build flags for component kernels
// 1) INC_SCALING
// 2) INC_BLENDING
// 3) INC_BLENDING and INC_SCALING
// 4) (no flags)
#define MxN_MULTIPLE_BLOCKS
//------------------------------------------------------------------------------
#if defined(MxN_MULTIPLE_BLOCKS)
// Do Multiple Block Processing ------------------------------------------------
// The 1st block has been processed before entering the loop
// Processed all blocks?
add.z.f0.0 (1) wNUM_BLKS:w wNUM_BLKS:w -1:w
// Reached multi-block width?
add (1) wORIX:w wORIX:w 16:w
cmp.l.f0.1 (1) null:w acc0.0:w wFRAME_ENDX:w // acc0.0 has wORIX
#if defined(INC_SCALING)
// Update SRC_VID_H_ORI for scaling
mul (1) REG(r,nTEMP0):f fVIDEO_STEP_X:f 16.0:f
add (1) fSRC_VID_H_ORI:f REG(r,nTEMP0):f fSRC_VID_H_ORI:f
#endif
#if defined(INC_BLENDING)
// Update SRC_ALPHA_H_ORI for blending
mul (1) REG(r,nTEMP0):f fALPHA_STEP_X:f 16.0:f
add (1) fSRC_ALPHA_H_ORI:f REG(r,nTEMP0):f fSRC_ALPHA_H_ORI:f
#endif
(f0.0)jmpi (1) END_VIDEO_PROCESSING // All blocks are done - Exit loop
(f0.1)jmpi (1) VIDEO_PROCESSING_LOOP // If not the end of row, goto the beginning of the loop
//If end of row, restart Horizontal offset and calculate Vertical offsets next row.
mov (1) wORIX:w wCOPY_ORIX:w
add (1) wORIY:w wORIY:w 8:w
#if defined(INC_SCALING)
// Update SRC_VID_H_ORI and SRC_VID_V_ORI for scaling
mov (1) fSRC_VID_H_ORI:f fFRAME_VID_ORIX:f // Reset normalised X origin to 0 for video and alpha
mul (1) REG(r,nTEMP0):f fVIDEO_STEP_Y:f 8.0:f
add (1) fSRC_VID_V_ORI:f REG(r,nTEMP0):f fSRC_VID_V_ORI:f
#endif
#if defined(INC_BLENDING)
// Update SRC_ALPHA_H_ORI and SRC_ALPHA_V_ORI for blending
mov (1) fSRC_ALPHA_H_ORI:f fFRAME_ALPHA_ORIX:f // Reset normalised X origin to 0 for video and alpha
mul (1) REG(r,nTEMP0):f fALPHA_STEP_Y:f 8.0:f
add (1) fSRC_ALPHA_V_ORI:f REG(r,nTEMP0):f fSRC_ALPHA_V_ORI:f
#endif
jmpi (1) VIDEO_PROCESSING_LOOP // Continue Loop
END_VIDEO_PROCESSING:
nop
#endif
END_THREAD // End of Thread
\ No newline at end of file
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//////////////////////////////////////////////////////////////////////////////////
// Multiple_Loop_Head.asm
// This code sets up the loop control for multiple blocks per thread
mul (1) wFRAME_ENDX:w ubBLK_CNT_X:ub 16:uw { NoDDClr } // Build multi-block loop counters
mov (1) wNUM_BLKS:w ubNUM_BLKS:ub { NoDDClr, NoDDChk } // Copy num blocks to word variable
mov (1) wCOPY_ORIX:w wORIX:w { NoDDChk } // Copy multi-block origin in pixel
mov (2) fFRAME_VID_ORIX<1>:f fSRC_VID_H_ORI<4;2,2>:f // Copy src video origin for scaling, and alpha origin for blending
add (1) wFRAME_ENDX:w wFRAME_ENDX:w wORIX:w // Continue building multi-block loop counters
VIDEO_PROCESSING_LOOP: // Loop back entry point as the biginning of the loop for multiple blocks
// Beginning of the loop
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: NV11_Load_4x8.asm
//----------------------------------------------------------------
#define NV11_LOAD_4x8
#include "PL2_Load.inc"
// Load 16x8 NV11 Y ------------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 8x8 NV11 UV ----------------------------------------------------------
asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x8)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for (nUV_NUM_OF_ROWS/4-1; >-1; -1) {
mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<32;16,2>
mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<32;16,2>
}
// End of NV11_Load_4x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: NV11_Load_5x8.asm
//----------------------------------------------------------------
#define NV11_LOAD_5x8
#include "PL2_Load.inc"
// Load 16x8 NV11 Y ------------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 12x8 NV11 UV ---------------------------------------------------------
asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (12x8)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<16;8,2>
mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<16;8,2>
}
// End of NV11_Load_5x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: NV12_Load_8x4.asm
//----------------------------------------------------------------
#define NV12_LOAD_8x4
#include "PL2_Load.inc"
// Load 16x8 planar Y ----------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 8x4 planar U and V -----------------------------------------------------
asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (16x4)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<32;16,2>
mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<32;16,2>
}
// End of NV12_Load_8x4
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: NV12_Load_8x5.asm
//----------------------------------------------------------------
#define NV12_LOAD_8x5
#include "PL2_Load.inc"
// Load 16x8 planar Y ----------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 8x5 planar U and V -----------------------------------------------------
asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (16x5)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<16;8,2>
mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<16;8,2>
}
// End of NV12_Load_8x5
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: NV12_Load_9x5.asm
//----------------------------------------------------------------
#define NV12_LOAD_9x5
#include "PL2_Load.inc"
// Load 16x8 planar Y ----------------------------------------------------------
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 9x5 planar U and V -----------------------------------------------------
asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (20x5)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
#endif
$for(nUV_NUM_OF_ROWS-2; >-1; -1) {
mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<16;8,2>
mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<16;8,2>
}
// End of NV12_Load_9x5
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: P208_Load_8x8.asm
//----------------------------------------------------------------
#define P208_LOAD_8x8
#include "PL2_Load.inc"
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y Block width and height (16x8) (U/V block size is the same)
// Load 16x8 P208 Y ------------------------------------------------------------
#if !defined(LOAD_UV_ONLY)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 16x8 planar UV -----------------------------------------------------
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (0; <nY_NUM_OF_ROWS; 1) {
mov (16) uwDEST_Y(0,%1*16) ubSRC_Y(0,%1*16)
}
#endif
$for (0; <nUV_NUM_OF_ROWS/2; 1) {
mov (16) uwDEST_U(0,%1*16) ubSRC_U(0,%1*32)<32;16,2>
mov (16) uwDEST_V(0,%1*16) ubSRC_U(0,%1*32+1)<32;16,2>
}
// End of P208_Load_8x8.asm
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: P208_Load_9x8.asm
//----------------------------------------------------------------
#define P208_LOAD_9x8
#include "PL2_Load.inc"
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
// Load 16x8 P208 Y ------------------------------------------------------------
#if !defined(LOAD_UV_ONLY)
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
#endif
// Load 16x8 planar UV -----------------------------------------------------
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (20x8)
mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
#if !defined(LOAD_UV_ONLY)
$for (0; <nY_NUM_OF_ROWS; 1) {
mov (16) uwDEST_Y(0,%1*16) ubSRC_Y(0,%1*16)
}
#endif
$for (0; <nUV_NUM_OF_ROWS; 1) {
mov (16) uwDEST_U(0,%1*16) ubSRC_U(0,%1*32)<32;16,2>
mov (16) uwDEST_V(0,%1*16) ubSRC_U(0,%1*32+1)<32;16,2>
}
// End of P208_Load_9x8.asm
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PA_Load.inc
//
// YUV422 data are first loaded to bottom I/O REGION_2, then unpacked to planar data
// and stored in top I/O REGION_1
#undef nY_NUM_OF_ROWS
#undef nUV_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#if defined(PA_LOAD_8x8)
#define nDPR_BLOCK_SIZE_YUV nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // Y block size 32x8
#define nDPR_MSG_SIZE_YUV nRESLEN_8 // # of MRF's to hold Y block data (8)
#endif
#if defined(PA_LOAD_9x8)
#define nDPR_BLOCK_SIZE_YUV_MAIN nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // Main YUV block size 32x8
#define nDPR_MSG_SIZE_YUV_MAIN nRESLEN_8 // # of MRF's to hold Y block data (8)
#define nDPR_BLOCK_SIZE_YUV_ADDITION nBLOCK_WIDTH_4+nBLOCK_HEIGHT_8 // Additional YUV block size 4x8
#define nDPR_MSG_SIZE_YUV_ADDITION nRESLEN_1 // # of MRF's to hold Y block data (8)
#endif
#define udSRC_YUV udBOT_Y_IO
#define nSRC_YUV_REG nBOT_Y
#define uwDEST_Y uwTOP_Y
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
// End of PA_Load.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PA_Load_8x8.asm
//----------------------------------------------------------------
#define PA_LOAD_8x8
#include "PA_Load.inc"
// Load 16x8 packed data block
// Packed data block should be loaded as 32x8 pixel block
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin
shl (1) rMSGSRC.0<1>:d acc0:w 1:w // H. block origin need to be doubled
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV:ud // Block width and height (32x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_YUV(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
// Unpack to "planar" YUV422 format in word-aligned bytes
add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub nSRC_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (16) uwDEST_Y(0, %1*16)<1> r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2)
mov (8) uwDEST_U(0, %1*8)<1> r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4)
mov (8) uwDEST_V(0, %1*8)<1> r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4)
}
// End of PA_Load_8x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PA_Load_9x8.asm
//----------------------------------------------------------------
// This module loads 16x8 Y, 9x8 U and 9x8 V planar data blocks for CSC module
// and stores it in word-aligned format.
//----------------------------------------------------------------
#define PA_LOAD_9x8
#include "PA_Load.inc"
// Load 18x8 packed data block
// Packed data block should be loaded as 36x8 pixel block
add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin
shl (1) rMSGSRC.0<1>:d acc0:w 1:w // H. block origin need to be doubled
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV_MAIN:ud // Block width and height (32x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_YUV(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_MAIN+nBI_CURRENT_SRC_YUV:ud
add (1) rMSGSRC.0<1>:d rMSGSRC.0:d 32:w //the last 4 pixels are read again for optimization
mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV_ADDITION:ud // Block width and height (4x8)
mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_YUV(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_ADDITION+nBI_CURRENT_SRC_YUV:ud
// Unpack to "planar" YUV422 format in word-aligned bytes
add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub nSRC_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (16) uwDEST_Y(0, %1*16)<1> r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2)
mov (8) uwDEST_U(0, %1*16)<1> r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4)
mov (8) uwDEST_V(0, %1*16)<1> r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4)
}
$for(0; <nUV_NUM_OF_ROWS; 1) {
mov (1) uwDEST_U(0, %1*16+8)<1> r[pCF_U_OFFSET, %1*4+256]REGION(1,0)
mov (1) uwDEST_V(0, %1*16+8)<1> r[pCF_V_OFFSET, %1*4+256]REGION(1,0)
}
//UV expansion done in PL9x8_PL16x8.asm module
// End of PA_Load_9x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL16x8_PL8x4.asm
//----------------------------------------------------------------
#include "common.inc"
#ifndef DEST_U //DEST_U, DEST_V not defined
#if (nSRC_REGION==nREGION_1)
#define DEST_Y uwTOP_Y
#define DEST_U uwTOP_U
#define DEST_V uwTOP_V
#elif (nSRC_REGION==nREGION_2)
#define DEST_Y uwBOT_Y
#define DEST_U uwBOT_U
#define DEST_V uwBOT_V
#endif
#endif
//Convert 444 from sampler to 422
$for (0, 0; <8; 2, 1) {
mov (8) DEST_U(0,%2*8)<1> DEST_U(%1)<16;8,2>
mov (8) DEST_V(0,%2*8)<1> DEST_V(%1)<16;8,2>
}
// Re-define new number of lines
#undef nUV_NUM_OF_ROWS
#define nUV_NUM_OF_ROWS 4
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL16x8_PL8x8.asm
//----------------------------------------------------------------
#include "common.inc"
#ifndef DEST_U
//DEST_U, DEST_V not defined
#if (nSRC_REGION==nREGION_1)
#define DEST_Y uwTOP_Y
#define DEST_U uwTOP_U
#define DEST_V uwTOP_V
#elif (nSRC_REGION==nREGION_2)
#define DEST_Y uwBOT_Y
#define DEST_U uwBOT_U
#define DEST_V uwBOT_V
#endif
#endif
//Convert 444 from sampler to 422
$for (0, 0; <8; 2, 1) {
mov DEST_U(%2)<1> DEST_U(%1)<16;8,2>
mov DEST_V(%2)<1> DEST_V(%1)<16;8,2>
}
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL2_Load.inc
#undef nY_NUM_OF_ROWS
#undef nUV_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
#define nDPR_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8
#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4)
#if defined(NV11_LOAD_4x8)
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8 // U/V block size 8x8
#define nDPR_MSG_SIZE_UV nRESLEN_2 // # of MRF's to hold U/V block data (2)
#endif
#if defined(NV11_LOAD_5x8)
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_12+nBLOCK_HEIGHT_8 // U/V block size 12x8
#define nDPR_MSG_SIZE_UV nRESLEN_4 // # of MRF's to hold U/V block data (4)
#endif
#if defined(NV12_LOAD_8x4)
#define nUV_NUM_OF_ROWS 4 // Number of U/V rows per block
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // U/V block size 16x4
#define nDPR_MSG_SIZE_UV nRESLEN_2 // # of MRF's to hold U/V block data (2)
#endif
#if defined(NV12_LOAD_8x5)
#define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_5 // U/V block size 16x5
#define nDPR_MSG_SIZE_UV nRESLEN_3 // # of MRF's to hold U/V block data (3)
#endif
#if defined(NV12_LOAD_9x5)
#define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_20+nBLOCK_HEIGHT_5 // U/V block size 20x5
#define nDPR_MSG_SIZE_UV nRESLEN_5 // # of MRF's to hold U/V block data (5)
#endif
#if defined(P208_LOAD_8x8)
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // U/V block size 16x8
#define nDPR_MSG_SIZE_UV nRESLEN_4 // # of MRF's to hold U/V block data (4)
#endif
#if defined(P208_LOAD_9x8)
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_20+nBLOCK_HEIGHT_8 // U/V block size 20x8
#define nDPR_MSG_SIZE_UV nRESLEN_8 // # of MRF's to hold U/V block data (8)
#endif
// Source/destination region definitions
#if !defined(udSRC_Y)
#define udSRC_Y udBOT_Y_IO // Default Y source region is top Y region
#endif
#if !defined(udSRC_U)
#define udSRC_U udBOT_U_IO // Default U source region is top U region
#endif
#define ubSRC_Y ubBOT_Y
#define nSRC_Y_REG nBOT_Y
#define ubSRC_U ubBOT_U
#define nSRC_U_REG nBOT_U
#define uwDEST_Y uwTOP_Y // However they can be transferred to word-aligned byte if desired
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
// End of PL2_Load.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL3_Load.inc
#undef nY_NUM_OF_ROWS
#undef nUV_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
#define nDPR_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8
#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4)
#if defined(IMC3_LOAD_8x4)
#define nUV_NUM_OF_ROWS 4 // Number of U/V rows per block
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // U/V block size 8x4
#define nDPR_MSG_SIZE_UV nRESLEN_1 // # of MRF's to hold U/V block data (1)
#endif
#if defined(IMC3_LOAD_8x5)
#define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_5 // U/V block size 8x5
#define nDPR_MSG_SIZE_UV nRESLEN_2 // # of MRF's to hold U/V block data (2)
#endif
#if defined(IMC3_LOAD_9x5)
#define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
#define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_12+nBLOCK_HEIGHT_5 // U/V block size 12x5
#define nDPR_MSG_SIZE_UV nRESLEN_3 // # of MRF's to hold U/V block data (3)
#endif
// Source/destination region definitions
#if !defined(udSRC_Y)
#define udSRC_Y udBOT_Y_IO // Default Y source region is top Y region
#endif
#if !defined(udSRC_U)
#define udSRC_U udBOT_U_IO // Default U source region is top U region
#endif
#if !defined(udSRC_V)
#define udSRC_V udBOT_V_IO // Default V source region is top V region
#endif
#define ubSRC_Y ubBOT_Y // Loading data are always in byte type
#define ubSRC_U ubBOT_U
#define ubSRC_V ubBOT_V
#define uwDEST_Y uwTOP_Y // However they can be transferred to word-aligned byte if desired
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
// End of PL3_Load.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
#include "PL4x8_Save_NV11.inc"
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
#if !defined(SAVE_UV_ONLY)
// Save current planar frame Y block data (16x8) -------------------------------
mov (2) mMSGHDR.0<1>:d wORIX<2;2,1>:w // Block origin
mov (1) mMSGHDR.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8)
///* Yoni - masking is not relevant for ILK?!?
//
// //Use the mask to determine which pixels shouldn't be over-written
// cmp.ge.f0.0 (1) NULLREG BLOCK_MASK_D:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
// (f0.0) jmpi WritePlanarToDataPort
//
// //If mask is not all 1's, then load the entire 16x8 block
// //so that only those bytes may be modified that need to be (using the mask)
// send (8) SRC_YD(0)<1> MSGHDR MSGSRC<8;8,1>:ud DWBRMSGDSC+0x00040000+BI_DEST_Y:ud //16x8
//
// asr (2) MSGSRC.0<1>:ud ORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's
// mov (1) MSGSRC.2<1>:ud 0x00030007:ud // Block width and height (8x4)
// send (8) SRC_UD(0)<1> MSGHDR MSGSRC<8;8,1>:ud DWBRMSGDSC+0x00010000+BI_DEST_U:ud
// send (8) SRC_VD(0)<1> MSGHDR MSGSRC<8;8,1>:ud DWBRMSGDSC+0x00010000+BI_DEST_V:ud
//
// //Restore the origin information
// mov (2) MSGSRC.0<1>:ud ORIX<2;2,1>:w // Block origin
// mov (1) MSGSRC.2<1>:ud 0x0007000F:ud // Block width and height (16x8)
//
// //expand U and V to be aligned on word boundary
// mov (16) SRC_UW(1)<1> SRC_U(0,16)
// mov (16) SRC_UW(0)<1> SRC_U(0, 0)
// mov (16) SRC_VW(1)<1> SRC_V(0,16)
// mov (16) SRC_VW(0)<1> SRC_V(0, 0)
//
// //Merge the data
// mov (1) f0.1:uw BLOCK_MASK_V:uw //Load the mask on flag reg
// (f0.1) mov (8) TEMP0<1>:uw BLOCK_MASK_H:uw
// (-f0.1) mov (8) TEMP0<1>:uw 0:uw
//
// // Destination is Word aligned
// $for(0; <Y_ROW_SIZE; 2) {
// mov (1) f0.1:uw TEMP(0,%1)<0;1,0>
// (-f0.1) mov (16) DEST_Y(0, %1*32)<2> SRC_Y(0, %1*16)
// (-f0.1) mov (16) DEST_U(0, %1*8)<1> SRC_U(0, %1*8) //only works for Word aligned Byte data
// (-f0.1) mov (16) DEST_V(0, %1*8)<1> SRC_V(0, %1*8) //only works for Word aligned Byte data
//
// mov (1) f0.1:uw TEMP(0,1+%1)<0;1,0>
// (-f0.1) mov (16) DEST_Y(0, 1+%1*32)<2> SRC_Y(0, 1+%1*16)
//
// }
//
//*/ Yoni - masking is not relevant for ILK?!?
WritePlanarToDataPort:
$for(0,0; <nY_NUM_OF_ROWS; 2,1) {
mov (16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2)
mov (16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
#endif
// Save U/V data block in planar format (4x8) ----------------------------------
mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin
asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
$for(0,0; <nY_NUM_OF_ROWS;4,1) {
mov (16) mubMSGPAYLOAD(%2,0)<2> ub2DEST_U(%2)REGION(16,2)
mov (16) mubMSGPAYLOAD(%2,1)<2> ub2DEST_V(%2)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
// End of PL4x8_Save_NV11
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//Module name: PL8x4_Save_NV11.inc
//
// Setup for storing planar data
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8
#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4)
#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8 // U/V interleaved block width and height (8x8)
#define nDPW_MSG_SIZE_UV nMSGLEN_2 // # of MRF's to hold U/V block data (2)
#if (nSRC_REGION==nREGION_1)
#define udSRC_Y udBOT_Y_IO
#define udSRC_U udBOT_U_IO
#define udSRC_V udBOT_V_IO
#define ubSRC_Y ubBOT_Y
#define ubSRC_U ubBOT_U
#define ubSRC_V ubBOT_V
#define uwSRC_U uwBOT_U //For masking operation
#define uwSRC_V uwBOT_V
#define ub2DEST_Y ub2TOP_Y
#define ub2DEST_U ub2TOP_U
#define ub2DEST_V ub2TOP_V
#elif (nSRC_REGION==nREGION_2)
#define udSRC_Y udTOP_Y_IO
#define udSRC_U udTOP_U_IO
#define udSRC_V udTOP_V_IO
#define ubSRC_Y ubTOP_Y
#define ubSRC_U ubTOP_U
#define ubSRC_V ubTOP_V
#define uwSRC_U uwTOP_U //For masking operation
#define uwSRC_V uwTOP_V
#define ub2DEST_Y ub2BOT_Y
#define ub2DEST_U ub2BOT_U
#define ub2DEST_V ub2BOT_V
#endif
///* Yoni - masking is not relevant for ILK?!?
//#define TEMP0 REG(r,54)
//.declare TEMP Base=TEMP0 ElementSize=2 SrcRegion=<8;8,1> Type=uw
///* Yoni - masking is not relevant for ILK?!?
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL5x8_PL16x8.asm
#include "Expansion.inc"
//------------------------------ Horizontal Upconversion -----------------------------
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
avg.sat (16) uwDEST_U(0, %1*32+16) uwDEST_U(0, %1*16+7)<1;2,0> uwDEST_U(0, %1*16+7)<1;2,1>
avg.sat (16) uwDEST_V(0, %1*32+16) uwDEST_V(0, %1*16+7)<1;2,0> uwDEST_V(0, %1*16+7)<1;2,1>
avg.sat (16) uwDEST_U(0, %1*32) uwDEST_U(0, %1*16)<1;2,0> uwDEST_U(0, %1*16)<1;2,1>
avg.sat (16) uwDEST_V(0, %1*32) uwDEST_V(0, %1*16)<1;2,0> uwDEST_V(0, %1*16)<1;2,1>
}
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
avg.sat (16) uwDEST_U(0, %1*32+16) uwDEST_U(0, %1*32+18)<1;2,0> uwDEST_U(0, %1*32+18)<1;2,1>
avg.sat (16) uwDEST_V(0, %1*32+16) uwDEST_V(0, %1*32+18)<1;2,0> uwDEST_V(0, %1*32+18)<1;2,1>
avg.sat (16) uwDEST_U(0, %1*32) uwDEST_U(0, %1*32)<1;2,0> uwDEST_U(0, %1*32)<1;2,1>
avg.sat (16) uwDEST_V(0, %1*32) uwDEST_V(0, %1*32)<1;2,0> uwDEST_V(0, %1*32)<1;2,1>
}
// End of PL5x8_PL16x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL5x8_PL8x8.asm
#include "Expansion.inc"
//------------------------------ Horizontal Upconversion -----------------------------
$for (0; <nUV_NUM_OF_ROWS; 1) {
avg.sat (8) uwDEST_U(0, %1*8) uwDEST_U(0, %1*8)<1;2,0> uwDEST_U(0, %1*8)<1;2,1>
avg.sat (8) uwDEST_V(0, %1*8) uwDEST_V(0, %1*8)<1;2,0> uwDEST_V(0, %1*8)<1;2,1>
}
// End of PL5x8_PL8x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x4_Save_IMC3.asm
//
// Save planar YUV420 frame data block of size 16x8
#include "PL8x4_Save_IMC3.inc"
//Use the mask to determine which pixels shouldn't be over-written
and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud
cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
(f0.0) jmpi WritePlanarToDataPort
//If mask is not all 1's, then load the entire 16x8 block
//so that only those bytes may be modified that need to be (using the mask)
// Load 16x8 planar Y ----------------------------------------------------------
mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_Y(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
// Load 8x4 planar U and V -----------------------------------------------------
asr (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // Block width and height (8x4)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_U(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_U:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
send (8) udSRC_V(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_V:ud
//expand U and V to be aligned on word boundary - Y remains in bytes
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
mov (16) uwSRC_U(0, %1*16)<1> ubSRC_U(0, %1*16)
mov (16) uwSRC_V(0, %1*16)<1> ubSRC_V(0, %1*16)
}
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
// Destination is Word aligned
$for(0; <nY_NUM_OF_ROWS; 2) {
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (16) ub2DEST_Y(0, %1*32)<2> ubSRC_Y(0, %1*16)
(-f0.1) mov (16) ub2DEST_U(0, %1*8)<1> ubSRC_U(0, %1*8) //only works for Word aligned Byte data
(-f0.1) mov (16) ub2DEST_V(0, %1*8)<1> ubSRC_V(0, %1*8) //only works for Word aligned Byte data
mov (1) f0.1:uw uwMASK_TEMP(0,1+%1)<0;1,0>
(-f0.1) mov (16) ub2DEST_Y(0, 1+%1*32)<2> ubSRC_Y(0, 1+%1*16)
}
WritePlanarToDataPort:
#if !defined(SAVE_UV_ONLY)
// Save current planar frame Y block data (16x8) -------------------------------
mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
$for(0,0; <nY_NUM_OF_ROWS; 2,1) {
mov(16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2)
mov(16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
#endif
// Save U/V data block in planar format (8x4) ----------------------------------
asr (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // Block width and height (8x4)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
// Save U picture data ---------------------------------------------------------
mov (16) mubMSGPAYLOAD(0,0)<1> ub2DEST_U(0)REGION(16,2) // U rows 0,1
mov (16) mubMSGPAYLOAD(0,16)<1> ub2DEST_U(1)REGION(16,2) // U rows 2,3
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_U:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
// Save V picture data ---------------------------------------------------------
mov (16) mubMSGPAYLOAD(0,0)<1> ub2DEST_V(0)REGION(16,2) // V rows 0,1
mov (16) mubMSGPAYLOAD(0,16)<1> ub2DEST_V(1)REGION(16,2) // V rows 2,3
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_V:ud
// End of PL8x4_Save_IMC3
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x4_Save_IMC3.inc
//
// Setup for storing planar data
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
// For saving
#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8
#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4)
#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // U/V block size 8x4
#define nDPW_MSG_SIZE_UV nMSGLEN_1 // # of MRF's to hold U/V block data (1)
// For masking
#undef nDPR_MSG_SIZE_Y
#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4)
#undef nDPR_MSG_SIZE_UV
#define nDPR_MSG_SIZE_UV nRESLEN_1 // # of MRF's to hold U/V block data (1)
#define rMASK_TEMP REG(r,nTEMP0)
.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#if (nSRC_REGION==nREGION_1)
// For saving
#define ub2DEST_Y ub2TOP_Y
#define ub2DEST_U ub2TOP_U
#define ub2DEST_V ub2TOP_V
//For masking operation
#define udSRC_Y udBOT_Y_IO
#define udSRC_U udBOT_U_IO
#define udSRC_V udBOT_V_IO
#define ubSRC_Y ubBOT_Y
#define ubSRC_U ubBOT_U
#define ubSRC_V ubBOT_V
#define uwSRC_U uwBOT_U
#define uwSRC_V uwBOT_V
#elif (nSRC_REGION==nREGION_2)
// For saving
#define ub2DEST_Y ub2BOT_Y
#define ub2DEST_U ub2BOT_U
#define ub2DEST_V ub2BOT_V
//For masking operation
#define udSRC_Y udTOP_Y_IO
#define udSRC_U udTOP_U_IO
#define udSRC_V udTOP_V_IO
#define ubSRC_Y ubTOP_Y
#define ubSRC_U ubTOP_U
#define ubSRC_V ubTOP_V
#define uwSRC_U uwTOP_U
#define uwSRC_V uwTOP_V
#endif
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x4_Save_NV12.asm
//
// Save entire current planar frame data block of size 16x8
//---------------------------------------------------------------
// Symbols needed to be defined before including this module
//
// DWORD_ALIGNED_DEST: only if DEST_Y, DEST_U, DEST_V data are DWord aligned
// ORIX:
//---------------------------------------------------------------
#include "PL8x4_Save_NV12.inc"
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
#if !defined(SAVE_UV_ONLY)
// Save current planar frame Y block data (16x8) -------------------------------
mov (2) mMSGHDR.0<1>:d wORIX<2;2,1>:w // Block origin
mov (1) mMSGHDR.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8)
#endif
//Use the mask to determine which pixels shouldn't be over-written
and (1) acc0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud
cmp.ge.f0.0 (1) dNULLREG acc0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
(f0.0) jmpi WritePlanarToDataPort
//If mask is not all 1's, then load the entire 16x8 block
//so that only those bytes may be modified that need to be (using the mask)
send (8) udSRC_Y(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud //16x8
asr (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w 1:w { NoDDClr } // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud { NoDDChk } // Block width and height (16x4)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud //move message desrcptor to the message header
send (8) udSRC_U(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
//Restore the origin information
mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // Block origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud //move message desrcptor to the message header
//Merge the data
mov (1) f0.1:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.1) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw
(-f0.1) mov (8) rMASK_TEMP<1>:uw 0:uw
//convert the mask from 16bits to 8bits by selecting every other bit
mov (1) udMASK_TEMP1(0,0)<1> 0x00040001:ud
mov (1) udMASK_TEMP1(0,1)<1> 0x00400010:ud
mov (1) udMASK_TEMP1(0,2)<1> 0x04000100:ud
mov (1) udMASK_TEMP1(0,3)<1> 0x40001000:ud
//merge the loaded block with the current block
$for(0,0; <nY_NUM_OF_ROWS; 2,1) {
mov (1) f0.1:uw uwMASK_TEMP(0, %1)<0;1,0>
(-f0.1) mov (16) ubDEST_Y(0,%1*32)<2> ubSRC_Y(0,%1*16)
and.nz.f0.1 (8) wNULLREG uwMASK_TEMP(0,%1)<0;1,0> uwMASK_TEMP1(0,0) //change the mask by selecting every other bit
(-f0.1) mov (8) ubDEST_U(0, %2*16)<2> ub2SRC_U(0, %1*8)<16;8,2>
(-f0.1) mov (8) ubDEST_V(0, %2*16)<2> ub2SRC_U(0, %1*8+1)<16;8,2>
mov (1) f0.1:uw uwMASK_TEMP(0,1+%1)<0;1,0>
(-f0.1) mov (16) ubDEST_Y(0, (1+%1)*32)<2> ubSRC_Y(0, (1+%1)*16)
}
WritePlanarToDataPort:
#if !defined(SAVE_UV_ONLY)
$for(0,0; <nY_NUM_OF_ROWS; 2,1) {
mov (16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2)
mov (16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
#endif
//** Save 8x4 packed U and V -----------------------------------------------------
// we could write directly wORIX to mMSGHDR and then execute asr on it, that way we could
// avoid using rMSGSRC as a buffer and have one command less in code, but it is unknown whether
//it is possible to do asr on mMSGHDR so we use rMSGSRC.
mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin
asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // U/V block width and height (16x4)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
$for(0,0; <nY_NUM_OF_ROWS;4,1) {
mov (16) mubMSGPAYLOAD(%2,0)<2> ub2DEST_U(%2)REGION(16,2)
mov (16) mubMSGPAYLOAD(%2,1)<2> ub2DEST_V(%2)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
// End of PL8x4_Save_NV12
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//Module name: PL8x4_Save_NV12.inc
//
// Setup for storing planar data
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
#undef nDPW_BLOCK_SIZE_Y
#undef nDPW_MSG_SIZE_Y
#undef nDPW_BLOCK_SIZE_UV
#undef nDPW_MSG_SIZE_UV
#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8
#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4)
#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // U/V interleaved block width and height (16x4)
#define nDPW_MSG_SIZE_UV nMSGLEN_2 // # of MRF's to hold U/V block data (2)
// For masking
#undef nDPR_MSG_SIZE_Y
#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4)
#undef nDPR_MSG_SIZE_UV
#define nDPR_MSG_SIZE_UV nRESLEN_2
#define rMASK_TEMP REG(r,nTEMP0)
.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#define rMASK_TEMP1 REG(r,nTEMP1)
.declare udMASK_TEMP1 Base=rMASK_TEMP1 ElementSize=4 SrcRegion=<4;4,1> Type=ud //1 GRF
.declare uwMASK_TEMP1 Base=rMASK_TEMP1 ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#if (nSRC_REGION==nREGION_1)
#define udSRC_Y udBOT_Y_IO
#define udSRC_U udBOT_U_IO
#define udSRC_V udBOT_V_IO
#define ubSRC_Y ubBOT_Y
#define ubSRC_U ubBOT_U
#define ubSRC_V ubBOT_V
#define uwSRC_U uwBOT_U //For masking operation
#define uwSRC_V uwBOT_V
#define ub2DEST_Y ub2TOP_Y
#define ub2DEST_U ub2TOP_U
#define ub2DEST_V ub2TOP_V
#define ubDEST_Y ubTOP_Y
#define ubDEST_U ubTOP_U
#define ubDEST_V ubTOP_V
#define ub2SRC_U ub2BOT_U
#elif (nSRC_REGION==nREGION_2)
#define udSRC_Y udTOP_Y_IO
#define udSRC_U udTOP_U_IO
#define udSRC_V udTOP_V_IO
#define ubSRC_Y ubTOP_Y
#define ubSRC_U ubTOP_U
#define ubSRC_V ubTOP_V
#define uwSRC_U uwTOP_U //For masking operation
#define uwSRC_V uwTOP_V
#define ub2DEST_Y ub2BOT_Y
#define ub2DEST_U ub2BOT_U
#define ub2DEST_V ub2BOT_V
#define ubDEST_Y ubBOT_Y
#define ubDEST_U ubBOT_U
#define ubDEST_V ubBOT_V
#define ub2SRC_U ub2TOP_U
#endif
///* Yoni - masking is not relevant for ILK?!?
//#define TEMP0 REG(r,54)
//.declare TEMP Base=TEMP0 ElementSize=2 SrcRegion=<8;8,1> Type=uw
///* Yoni - masking is not relevant for ILK?!?
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x5_PL8x8.asm
#include "Expansion.inc"
//------------------------------- Vertical Upconversion ------------------------------
avg.sat (8) uwDEST_U(0, 3*16+8)<1> uwDEST_U(0, 3*8) uwDEST_U(0, (1+3)*8) // Optimization
avg.sat (8) uwDEST_V(0, 3*16+8)<1> uwDEST_V(0, 3*8) uwDEST_V(0, (1+3)*8) // Optimization
$for(nUV_NUM_OF_ROWS/2-2; >-1; -1) {
mov (8) uwDEST_U(0, (1+%1)*16)<1> uwDEST_U(0, (1+%1)*8)
avg.sat (8) uwDEST_U(0, %1*16+8)<1> uwDEST_U(0, %1*8) uwDEST_U(0, (1+%1)*8)
mov (8) uwDEST_V(0, (1+%1)*16)<1> uwDEST_V(0, (1+%1)*8)
avg.sat (8) uwDEST_V(0, %1*16+8)<1> uwDEST_V(0, %1*8) uwDEST_V(0, (1+%1)*8)
}
// End of PL8x5_PL8x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x8_PL8x4.asm
//
// Convert PL 8x8 to PL8x4 in GRF
//---------------------------------------------------------------
// Symbols needed to be defined before including this module
//
// DWORD_ALIGNED_DEST: only if DEST_Y, DEST_U, DEST_V data are DWord aligned
// ORIX:
//---------------------------------------------------------------
#include "PL8x8_PL8x4.inc"
// Convert PL8x8 to PL8x4 ---------------------------------------------------------
mov (8) ubDEST_U(0,16)<2> ubDEST_U(1)<16;8,2> //selecting U every other row
mov (16) ubDEST_U(0,32)<2> ubDEST_U(2)<32;8,2> //selecting U every other row
mov (8) ubDEST_V(0,16)<2> ubDEST_V(1)<16;8,2> //selecting V every other row
mov (16) ubDEST_V(0,32)<2> ubDEST_V(2)<32;8,2> //selecting V every other row
// End of PL8x8_PL8x4.asm -------------------------------------------------------
\ No newline at end of file
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x8_PL8x4.inc
//
// Setup module for convert PL8x8 to PL8x4
//
//
// Source/destination region definitions
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
#if (nSRC_REGION==nREGION_1)
//REGION_1 selected
#define ubDEST_Y ubTOP_Y
#define ubDEST_U ubTOP_U
#define ubDEST_V ubTOP_V
#elif (nSRC_REGION==nREGION_2)
//REGION_2 selected
#define ubDEST_Y ubBOT_Y
#define ubDEST_U ubBOT_U
#define ubDEST_V ubBOT_V
#endif
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x8_Save_P208.asm
//
// Save entire current planar frame data block of size 16x8
//---------------------------------------------------------------
// Symbols needed to be defined before including this module
//
// DWORD_ALIGNED_DEST: only if DEST_Y, DEST_U, DEST_V data are DWord aligned
// ORIX:
//---------------------------------------------------------------
#include "PL8x8_Save_P208.inc"
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
#if !defined(SAVE_UV_ONLY)
// Save current planar frame Y block data (16x8) -------------------------------
mov (2) mMSGHDR.0<1>:d wORIX<2;2,1>:w // Block origin
mov (1) mMSGHDR.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8)
WritePlanarToDataPort:
$for(0,0; <nY_NUM_OF_ROWS; 2,1) {
mov (16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2)
mov (16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
#endif
//** Save 8x8 packed U and V -----------------------------------------------------
// we could write directly wORIX to mMSGHDR and then execute asr on it, that way we could
// avoid using rMSGSRC as a buffer and have one command less in code, but it is unknown whether
//it is possible to do asr on mMSGHDR so we use rMSGSRC.
mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // U/V block width and height (16x4)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
$for(0,0; <nY_NUM_OF_ROWS;2,1) {
mov (16) mubMSGPAYLOAD(%2,0)<2> ub2DEST_U(%2)REGION(16,2)
mov (16) mubMSGPAYLOAD(%2,1)<2> ub2DEST_V(%2)REGION(16,2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
//End of PL8x8_Save_P208.asm
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//Module name: PL8x8_Save_P208.inc
//
// Setup for storing planar data
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8
#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4)
#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // U/V interleaved block width and height (16x8)
#define nDPW_MSG_SIZE_UV nMSGLEN_4 // # of MRF's to hold U/V block data (4)
#if (nSRC_REGION==nREGION_1)
#define udSRC_Y udBOT_Y_IO
#define udSRC_U udBOT_U_IO
#define udSRC_V udBOT_V_IO
#define ubSRC_Y ubBOT_Y
#define ubSRC_U ubBOT_U
#define ubSRC_V ubBOT_V
#define uwSRC_U uwBOT_U //For masking operation
#define uwSRC_V uwBOT_V
#define ub2DEST_Y ub2TOP_Y
#define ub2DEST_U ub2TOP_U
#define ub2DEST_V ub2TOP_V
#elif (nSRC_REGION==nREGION_2)
#define udSRC_Y udTOP_Y_IO
#define udSRC_U udTOP_U_IO
#define udSRC_V udTOP_V_IO
#define ubSRC_Y ubTOP_Y
#define ubSRC_U ubTOP_U
#define ubSRC_V ubTOP_V
#define uwSRC_U uwTOP_U //For masking operation
#define uwSRC_V uwTOP_V
#define ub2DEST_Y ub2BOT_Y
#define ub2DEST_U ub2BOT_U
#define ub2DEST_V ub2BOT_V
#endif
///* Yoni - masking is not relevant for ILK?!?
//#define TEMP0 REG(r,54)
//.declare TEMP Base=TEMP0 ElementSize=2 SrcRegion=<8;8,1> Type=uw
///* Yoni - masking is not relevant for ILK?!?
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x8_Save_PA.asm
//
// Save planar YUV422 to packed YUV422 format data
//
// Note: SRC_* must reference to regions with data type "BYTE"
// in order to save to byte-aligned byte location
#include "PL8x8_Save_PA.inc"
add (4) pCF_Y_OFFSET<1>:uw ubDEST_CF_OFFSET<4;4,1>:ub nDEST_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block
// Pack Y
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2> ubSRC_Y(0,%1*32)
}
// Pack U/V
$for(0; <nUV_NUM_OF_ROWS; 1) {
mov (8) r[pCF_U_OFFSET, %1*nGRFWIB]<4> ubSRC_U(0, %1*16)
mov (8) r[pCF_V_OFFSET, %1*nGRFWIB]<4> ubSRC_V(0, %1*16)
}
shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 1:w { NoDDClr } // H. block origin need to be doubled
mov (1) rMSGSRC.1<1>:d wORIY<0;1,0>:w { NoDDClr, NoDDChk } // Block origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_YUV:ud { NoDDChk } // Block width and height (32x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Use the mask to determine which pixels shouldn't be over-written
and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud
cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
(f0.0) jmpi WritePackedToDataPort
//If mask is not all 1's, then load the entire 32x8 block
//so that only those bytes may be modified that need to be (using the mask)
// Load 32x8 packed YUV 422 ----------------------------------------------------
send (8) udSRC_YUV(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
// Destination is Byte aligned
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (16) uwDEST_YUV(%1)<1> uwSRC_YUV(%1) //check the UV merge - vK
}
WritePackedToDataPort:
// Packed YUV data are stored in one of the I/O regions before moving to MRF
// Note: This is necessary since indirect addressing is not supported for MRF.
// Packed data block should be saved as 32x8 pixel block
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_YUV(%1)REGION(8,1)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud
// End of PL8x8_Save_PA
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL8x8_Save_PA.inc
//
// Setup for storing packed data
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
// For saving
#define nDPW_BLOCK_SIZE_YUV nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // YUV block size 32x8
#define nDPW_MSG_SIZE_YUV nMSGLEN_8 // # of MRF's to hold YUV block data (8)
// For masking
#undef nDPR_MSG_SIZE_YUV
#define nDPR_MSG_SIZE_YUV nRESLEN_8 // # of MRF's to hold YUV block data (8)
#define rMASK_TEMP REG(r,nTEMP0)
.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#if (nSRC_REGION==nREGION_1)
// For saving
#define udSRC_YUV udTOP_Y_IO
#define udDEST_YUV udBOT_Y_IO
#define nDEST_YUV_REG nBOT_Y
//For masking operation
#define ubSRC_Y ub2TOP_Y
#define ubSRC_U ub2TOP_U
#define ubSRC_V ub2TOP_V
#define uwSRC_YUV uwTOP_Y
#define uwDEST_YUV uwBOT_Y
#elif (nSRC_REGION==nREGION_2)
// For saving
#define udSRC_YUV udBOT_Y_IO
#define udDEST_YUV udTOP_Y_IO
#define nDEST_YUV_REG nTOP_Y
//For masking operation
#define ubSRC_Y ub2BOT_Y
#define ubSRC_U ub2BOT_U
#define ubSRC_V ub2BOT_V
#define uwSRC_YUV uwBOT_Y
#define uwDEST_YUV uwTOP_Y
#endif
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL9x5_PL16x8.asm
#define EXPAND_9x5
#include "Expansion.inc"
//------------------------------ Horizontal Upconversion -----------------------------
$for (nUV_NUM_OF_ROWS-2; >-1; -1) {
avg.sat (16) uwDEST_U(0, %1*16)<1> uwDEST_U(0, %1*16)<1;2,0> uwDEST_U(0, %1*16)<1;2,1>
avg.sat (16) uwDEST_V(0, %1*16)<1> uwDEST_V(0, %1*16)<1;2,0> uwDEST_V(0, %1*16)<1;2,1>
}
#undef nUV_NUM_OF_ROWS
#define nUV_NUM_OF_ROWS 8 //use packed version of all post-processing kernels
//------------------------------- Vertical Upconversion ------------------------------
avg.sat (16) uwDEST_U(0, 3*32+16)<1> uwDEST_U(0, 3*16) uwDEST_U(0, (1+3)*16)
avg.sat (16) uwDEST_V(0, 3*32+16)<1> uwDEST_V(0, 3*16) uwDEST_V(0, (1+3)*16)
$for(nUV_NUM_OF_ROWS/2-2; >-1; -1) {
mov (16) uwDEST_U(0, (1+%1)*32)<1> uwDEST_U(0, (1+%1)*16)
avg.sat (16) uwDEST_U(0, %1*32+16)<1> uwDEST_U(0, %1*16) uwDEST_U(0, (1+%1)*16)
mov (16) uwDEST_V(0, (1+%1)*32)<1> uwDEST_V(0, (1+%1)*16)
avg.sat (16) uwDEST_V(0, %1*32+16)<1> uwDEST_V(0, %1*16) uwDEST_V(0, (1+%1)*16)
}
// End of PL9x5_PL16x8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: PL9x5_PL16x8.asm
#include "Expansion.inc"
//------------------------------ Horizontal Upconversion -----------------------------
$for (0; <nUV_NUM_OF_ROWS; 1) {
avg.sat (16) uwDEST_U(0, %1*16)<1> uwDEST_U(0, %1*16)<1;2,0> uwDEST_U(0, %1*16)<1;2,1>
avg.sat (16) uwDEST_V(0, %1*16)<1> uwDEST_V(0, %1*16)<1;2,0> uwDEST_V(0, %1*16)<1;2,1>
}
// End of PL9x5_PL16x8
\ No newline at end of file
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: RGB16x8_Save_RGB.asm
//
// Save packed ARGB 444 frame data block of size 16x8
//
// To save 16x8 block (64x8 byte layout for ARGB8888) we need 2 send instructions
// ---------
// | 1 | 2 |
// ---------
#include "RGB16x8_Save_RGB.inc"
shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 2:w { NoDDClr } // H. block origin need to be quadrupled
mov (1) rMSGSRC.1<1>:d wORIY<0;1,0>:w { NoDDClr, NoDDChk } // Block origin (1st quadrant)
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_ARGB:ud { NoDDChk } // Block width and height (32x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Use the mask to determine which pixels shouldn't be over-written
and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud
cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
(f0.0) jmpi WriteARGBToDataPort
//If mask is not all 1's, then load the entire 64x8 block
//so that only those bytes may be modified that need to be (using the mask)
// Load first block 16x8 packed ARGB 444 ---------------------------------------
or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF00FF00:ud //Check first block
cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud
(f0.0) jmpi SkipFirstBlockMerge //If full mask then skip this block
send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw //use sel instruction - vK
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
$for(0, 0; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1)
}
SkipFirstBlockMerge:
// Load second block 16x8 packed ARGB 444 ---------------------------------------
or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF0000FF:ud //Check second block
cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud
(f0.0) jmpi WriteARGBToDataPort //If full mask then skip this block
add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part
send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud // Point to 1st part again
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) shr (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw 8:uw //load the mask for second block
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
$for(0, 1; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1)
}
WriteARGBToDataPort:
// Move packed data to MRF and output
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*2+1)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
// End of RGB16x8_Save_RGB
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: RGB16x8_Save_RGB.inc
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
// For saving
#define nDPW_BLOCK_SIZE_ARGB nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // ARGB block size 32x8
#define nDPW_MSG_SIZE_ARGB nMSGLEN_8 // # of MRF's to hold ARGB block data (8)
// For masking
#undef nDPR_MSG_SIZE_ARGB
#define nDPR_MSG_SIZE_ARGB nRESLEN_8 // # of MRF's to hold ARGB block data (8)
#define rMASK_TEMP REG(r,nTEMP0)
.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#if (nSRC_REGION==nREGION_1)
// For saving
#define udDEST_ARGB udTOP_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache
//For masking operation
#define udSRC_ARGB udBOT_Y_IO //To hold the destination data that shouldn't be modified
#elif (nSRC_REGION==nREGION_2)
// For saving
#define udDEST_ARGB udBOT_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache
//For masking operation
#define udSRC_ARGB udTOP_Y_IO //To hold the destination data that shouldn't be modified
#endif
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: RGB16x8_Save_RGB16.asm
//
// Save packed RGB565 frame data block of size 16x8
//
// To save 16x8 block (32x8 byte layout for RGB565) we need 1 send instruction
// -----
// | 1 |
// -----
#include "RGB16x8_Save_RGB16.inc"
//convert 32 bit RGB to 16 bit RGB
// Truncate A8R8G8B8 to A6R5G6B5 within byte.
// That is keeping 5 MSB of R and B, and 6 MSB of G.
$for (0, 0; <nY_NUM_OF_ROWS; 1, 2) {
shr uwCSC_TEMP(%1,0)<1> ubDEST_ARGB(%2,0)<32;8,4> 3:w // B >> 3
shl (16) uwTEMP_RGB16(0)<1> uwDEST_ARGB(%2,1)<16;8,2> 8:w // R << 8
and (16) uwTEMP_RGB16(0)<1> uwTEMP_RGB16(0) 0xF800:uw
or (16) uwCSC_TEMP(%1,0)<1> uwCSC_TEMP(%1,0)<16;16,1> uwTEMP_RGB16(0)
shr (16) uwTEMP_RGB16(0)<1> uwDEST_ARGB(%2,0)<16;8,2> 5:w // G >> 5
and (16) uwTEMP_RGB16(0)<1> uwTEMP_RGB16(0) 0x07E0:uw
or (16) uwCSC_TEMP(%1,0)<1> uwCSC_TEMP(%1,0)<16;16,1> uwTEMP_RGB16(0)
}
mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin (1st quadrant)
shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 1:w // H. block origin need to be doubled for byte offset
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_RGB16:ud // Block width and height (32x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Use the mask to determine which pixels shouldn't be over-written
and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud
cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
(f0.0) jmpi WriteRGB16ToDataPort
//If mask is not all 1's, then load the entire 32x8 block
//so that only those bytes may be modified that need to be (using the mask)
// Load 32x8 packed RGB565 -----------------------------------------------------
send (8) udSRC_RGB16(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw //use sel instruction - vK
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (16) uwCSC_TEMP(%1)<1> uwSRC_RGB16(%1)
}
WriteRGB16ToDataPort:
// Move packed data to MRF and output
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udCSC_TEMP(%1)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud
// End of RGB16x8_Save_RGB16
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: RGB16x8_Save_RGB16.inc
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
// For saving
#define nDPW_BLOCK_SIZE_RGB16 nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // RGB16 block size 32x8
#define nDPW_MSG_SIZE_RGB16 nMSGLEN_8 // # of MRF's to hold RGB16 block data (8)
// For conversion to 16bit
.declare uwTEMP_RGB16 Base=REG(r,nTEMP1) ElementSize=2 SrcRegion=<16;16,1> Type=uw //1 GRF
// For masking
#undef nDPR_MSG_SIZE_RGB16
#define nDPR_MSG_SIZE_RGB16 nRESLEN_8 // # of MRF's to hold ARGB block data (8)
#define rMASK_TEMP REG(r,nTEMP0)
.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#if (nSRC_REGION==nREGION_1)
// For saving
#define ubDEST_ARGB ubTOP_Y //Data from previous module
#define uwDEST_ARGB uwTOP_Y //Data from previous module
#define udCSC_TEMP udBOT_Y_IO //Data Converted to 16 bits
#define uwCSC_TEMP uwBOT_Y
//For masking operation
#define udSRC_RGB16 udTOP_Y_IO //To hold the destination data that shouldn't be modified
#define uwSRC_RGB16 uwTOP_Y
#elif (nSRC_REGION==nREGION_2)
// For saving
#define ubDEST_ARGB ubBOT_Y //Data from previous module
#define uwDEST_ARGB uwBOT_Y //Data from previous module
#define udCSC_TEMP udTOP_Y_IO //Data Converted to 16 bits
#define uwCSC_TEMP uwTOP_Y
//For masking operation
#define udSRC_RGB16 udBOT_Y_IO //To hold the destination data that shouldn't be modified
#define uwSRC_RGB16 uwBOT_Y
#endif
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: RGB16x8_Save_Y416.asm
//
// Save packed ARGB 444 frame data block of size 16x8
//
// To save 16x8 block (128x8 byte layout for ARGB 16bit per component) we need 4 send instructions
// -----------------
// | 1 | 2 | 3 | 4 |
// -----------------
#include "RGB16x8_Save_RGB.inc"
shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 3:w { NoDDClr } // H. block origin need to become 8 times
mov (1) rMSGSRC.1<1>:d wORIY<0;1,0>:w { NoDDClr, NoDDChk } // Block origin (1st quadrant)
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_ARGB:ud { NoDDChk } // Block width and height (32x8)
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
/* Not needed for validation kernels for now -vK
//Use the mask to determine which pixels shouldn't be over-written
and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud
cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified
(f0.0) jmpi WriteARGBToDataPort
//If mask is not all 1's, then load the entire 64x8 block
//so that only those bytes may be modified that need to be (using the mask)
// Load first block 16x8 packed ARGB 444 ---------------------------------------
or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF00FF00:ud //Check first block
cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud
(f0.0) jmpi SkipFirstBlockMerge //If full mask then skip this block
send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw //use sel instruction - vK
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
$for(0, 0; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1)
}
SkipFirstBlockMerge:
// Load second block 16x8 packed ARGB 444 ---------------------------------------
or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF0000FF:ud //Check second block
cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud
(f0.0) jmpi WriteARGBToDataPort //If full mask then skip this block
add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part
send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud // Point to 1st part again
//Merge the data
mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg
(f0.0) shr (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw 8:uw //load the mask for second block
(-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw
$for(0, 1; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest
mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0>
(-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1)
}
*/
WriteARGBToDataPort:
// Move packed data to MRF and output
//Write 1st 4X8 pixels
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
//Write 2nd 4X8 pixels
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4+1)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
//Write 3rd 4X8 pixels
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 64:d // Point to 2nd part
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4+2)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
//Write 4th 4X8 pixels
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 96:d // Point to 2nd part
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4+3)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
// End of RGB16x8_Save_Y416
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: RGB16x8_Save_Y416.inc
//
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
// For saving
#define nDPW_BLOCK_SIZE_ARGB nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // ARGB block size 32x8
#define nDPW_MSG_SIZE_ARGB nMSGLEN_8 // # of MRF's to hold ARGB block data (8)
// For masking
#undef nDPR_MSG_SIZE_ARGB
#define nDPR_MSG_SIZE_ARGB nRESLEN_8 // # of MRF's to hold ARGB block data (8)
#define rMASK_TEMP REG(r,nTEMP0)
.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF
#if (nSRC_REGION==nREGION_1)
// For saving
#define udDEST_ARGB udTOP_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache
//For masking operation
#define udSRC_ARGB udBOT_Y_IO //To hold the destination data that shouldn't be modified
#elif (nSRC_REGION==nREGION_2)
// For saving
#define udDEST_ARGB udBOT_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache
//For masking operation
#define udSRC_ARGB udTOP_Y_IO //To hold the destination data that shouldn't be modified
#endif
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
.declare SRC_B Base=REG(r,10) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare SRC_G Base=REG(r,18) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare SRC_R Base=REG(r,26) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare SRC_A Base=REG(r,34) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
#define DEST_ARGB ubBOT_ARGB
#undef nSRC_REGION
#define nSRC_REGION nREGION_2
//Pack directly to mrf as optimization - vK
$for(0, 0; <8; 1, 2) {
// mov (16) DEST_ARGB(%2,0)<4> SRC_B(%1) { Compr, NoDDClr } // 16 B
// mov (16) DEST_ARGB(%2,1)<4> SRC_G(%1) { Compr, NoDDClr, NoDDChk } // 16 G
// mov (16) DEST_ARGB(%2,2)<4> SRC_R(%1) { Compr, NoDDClr, NoDDChk } // 16 R //these 2 inst can be merged - vK
// mov (16) DEST_ARGB(%2,3)<4> SRC_A(%1) { Compr, NoDDChk } //DEST_RGB_FORMAT<0;1,0>:ub { Compr, NoDDChk } // 16 A
mov (8) DEST_ARGB(%2, 0)<4> SRC_B(%1) { NoDDClr } // 8 B
mov (8) DEST_ARGB(%2, 1)<4> SRC_G(%1) { NoDDClr, NoDDChk } // 8 G
mov (8) DEST_ARGB(%2, 2)<4> SRC_R(%1) { NoDDClr, NoDDChk } // 8 R
mov (8) DEST_ARGB(%2, 3)<4> SRC_A(%1) { NoDDChk } // 8 A
mov (8) DEST_ARGB(%2+1,0)<4> SRC_B(%1,8) { NoDDClr } // 8 B
mov (8) DEST_ARGB(%2+1,1)<4> SRC_G(%1,8) { NoDDClr, NoDDChk } // 8 G
mov (8) DEST_ARGB(%2+1,2)<4> SRC_R(%1,8) { NoDDClr, NoDDChk } // 8 R
mov (8) DEST_ARGB(%2+1,3)<4> SRC_A(%1,8) { NoDDChk } // 8 A
}
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Modual name: SetupVPKernel.asm
//
// Initial setup for running video-processing kernels
//
#include "common.inc"
//
// Now, begin source code....
//
.code
#include "Init_All_Regs.asm"
mov (8) rMSGSRC.0<1>:ud r0.0<8;8,1>:ud // Initialize message payload header with R0
#if defined (INC_BLENDING)
mul (1) fALPHA_STEP_X:f fSCALING_STEP_RATIO:f fVIDEO_STEP_X:f //StepX_ratio = AlphaStepX / VideoStepX
#endif
// End of SetupVPKernel
This diff is collapsed.
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: readSampler16x1.asm
//
// Read one row of pix through sampler
//
//#define SAMPLER_MSG_DSC 0x166A0000 // ILK Sampler Message Descriptor
// Send Message [DevILK] Message Descriptor
// MBZ MsgL=5 MsgR=8 H MBZ SIMD MsgType SmplrIndx BindTab
// 000 0 101 0 1000 1 0 10 0000 0000 00000000
// 0 A 8 A 0 0 0 0
// MsgL=1+2*2(u,v)=5 MsgR=8
#define SAMPLER_MSG_DSC 0x0A8A0000 // ILK Sampler Message Descriptor
// Assume MSGSRC is set already in the caller
//mov (8) rMSGSRC.0<1>:ud 0:ud // Unused fileds
// Read 16 sampled pixels and stored them in float32 in 8 GRFs
// 422 data is expanded to 444, return 8 GRF in the order of RGB- (UYV-).
// 420 data has three surfaces, return 8 GRF. Valid is always in the 1st GRF when in R8. Make sure no overwrite the following 3 GRFs.
// alpha data is expanded to 4444, return 8 GRF in the order of RGBA (UYVA).
mov(16) mMSGHDR<1>:uw rMSGSRC<16;16,1>:uw
send (16) DATABUF(0)<1> mMSGHDR udDUMMY_NULL 0x2 SAMPLER_MSG_DSC+SAMPLER_IDX+BINDING_IDX:ud
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Modual name: undefall.inc
//
// undefine all global symbol for new process
//
//Source definitions
#undef ubSRC_Y
#undef ubSRC_U
#undef ubSRC_V
#undef ub2SRC_Y
#undef ub2SRC_U
#undef ub2SRC_V
#undef ub4SRC_Y
#undef ub4SRC_U
#undef ub4SRC_V
#undef uwSRC_Y
#undef uwSRC_U
#undef uwSRC_V
#undef udSRC_Y
#undef udSRC_U
#undef udSRC_V
#undef udSRC_YUV
#undef nSRC_YUV_REG
//Destination definitions
#undef ubDEST_Y
#undef ubDEST_U
#undef ubDEST_V
#undef ub2DEST_Y
#undef ub2DEST_U
#undef ub2DEST_V
#undef ub4DEST_Y
#undef ub4DEST_U
#undef ub4DEST_V
#undef uwDEST_Y
#undef uwDEST_U
#undef uwDEST_V
#undef udDEST_Y
#undef udDEST_U
#undef udDEST_V
#undef udDEST_YUV
#undef nDEST_YUV_REG
#undef ubDEST_ARGB
// End of undefall.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: AVS_IEF.inc
#ifndef _AVS_INF_INC_
#define _AVS_INF_INC_
#include "undefall.inc" //Undefine the SRC and DEST sysmbols
// Message Header
// m0.7 31:0 Debug
// m0.6 31:0 Debug
// m0.5 31:0 Ignored
// m0.4 31:0 Ignored
// m0.3 31:0 Ignored
// m0.2 31:16 Ignored
// 15 Alpha Write Channel Mask enable=0, disable=1
// 14 Blue Write Channel Mask (V)
// 13 Green Write Channel Mask (Y)
// 12 Red Write Channel Mask (U)
// 11:0 Ignored
// m0.1 Ignored
// m0.0 Ignored
#define mAVS_8x8_HDR m0 // Message Header
#define mAVS_PAYLOAD m1 // Message Payload Header
#define mAVS_8x8_HDR_2 m2 // Message Header
#define mAVS_PAYLOAD_2 m3 // Message Payload Header
#define mAVS_8x8_HDR_UV m2 // Message Header
#define mAVS_PAYLOAD_UV m3 // Message Payload Header
#define rAVS_8x8_HDR rMSGSRC // Mirror of Message Header
#define rAVS_PAYLOAD r9 // Mirror of Message Payload Header
// AVS payload
// m1.7 Ignored
// m1.6 Pixel 0 V Address ---> ORIY (Y0)
// m1.5 Delta V ---> Step Y
// m1.4 Ignored
// m1.3 Ignored
// m1.2 Pixel 0 U Address ---> ORIX (X0)
// m1.1 U 2nd Derivative ---> NLAS dx
// m1.0 Delta U ---> Step X
// Sampler Message Descriptor
// 31:29 Reserved 000
// 28:25 Message length 0010
// 24:20 Response length xxxxx ---> 4GRFs for each enabled channel
// 19 Header Present 1
// 18 MBZ 0
// 17:16 SIMD Mode 11 ---> SIMD64
// 15:12 Message Type 0011 ---> sample_8x8
// 11:8 Sampler Index xxxx
// 7:0 Binding Table Index xxxxxxxx
#define nAVS_MSG_DSC_1CH 0x044BB000
#define nAVS_MSG_DSC_2CH 0x048BB000
#define nAVS_MSG_DSC_3CH 0x04CBB000
#define nAVS_MSG_DSC_4CH 0x050BB000
#define nAVS_RED_CHANNEL_ONLY 0x0000E000 // Enable Red channel only
#define nAVS_GREEN_CHANNEL_ONLY 0x0000D000 // Enable Green channel only
#define nAVS_RED_BLUE_CHANNELS 0x0000A000 // Enable Red and Blue channels
#define nAVS_RGB_CHANNELS 0x00008000 // Enable RGB(YUV) channels
#define nAVS_ALL_CHANNELS 0x00000000 // Enable all channels (ARGB\AYUV)
.declare ubAVS_RESPONSE Base=REG(r,nTEMP8) ElementSize=1 SrcRegion=REGION(16,1) Type=ub
.declare uwAVS_RESPONSE Base=REG(r,nTEMP8) ElementSize=2 SrcRegion=REGION(16,1) Type=uw
.declare ubAVS_RESPONSE_2 Base=REG(r,nTEMP24) ElementSize=1 SrcRegion=REGION(16,1) Type=ub
.declare uwAVS_RESPONSE_2 Base=REG(r,nTEMP24) ElementSize=2 SrcRegion=REGION(16,1) Type=uw
#if (nSRC_REGION==nREGION_2)
#define uwDEST_Y uwBOT_Y
#define uwDEST_U uwBOT_U
#define uwDEST_V uwBOT_V
#define ubDEST_Y ubBOT_Y
#undef nSRC_REGION
#define nSRC_REGION nREGION_2
#else //(nSRC_REGION==nREGION_1)
#define uwDEST_Y uwTOP_Y
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#define ubDEST_Y ubTOP_Y
#undef nSRC_REGION
#define nSRC_REGION nREGION_1
#endif
#endif //_AVS_INF_INC_
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//------------------------------------------------------------------------------
// AVS_SetupFirstBlock.asm
//------------------------------------------------------------------------------
// Setup Message Header
// mov (8) mAVS_8x8_HDR<1>:ud rMSGSRC<8;8,1>:ud
// Check NLAS Enable bit
and.z.f0.0 (1) wNULLREG uwNLAS_ENABLE:uw BIT15:uw
(f0.0)mov (1) fVIDEO_STEP_DELTA:f 0.0:f
// Setup Message Payload Header for 1st block of Media Sampler 8x8
mov (1) rAVS_PAYLOAD.0:f fVIDEO_STEP_DELTA:f //NLAS dx
mov (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f //Step X
mov (1) rAVS_PAYLOAD.5:f fVIDEO_STEP_Y:f //Step Y
mov (2) rAVS_PAYLOAD.2<4>:f fSRC_VID_H_ORI<2;2,1>:f //Orig X and Y
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//------------------------------------------------------------------------------
// AVS_SetupSecondBlock.asm
//------------------------------------------------------------------------------
//NLAS calculations for 2nd block of Media Sampler 8x8:
// X(i) = X0 + dx*i + ddx*i*(i-1)/2 ==> X(8) = X0 + dx*8 +ddx*28
// dx(i)= dx(0) + ddx*i ==> dx(8)= dx + ddx*8
// Calculating X(8)
mov (1) acc0.2<1>:f fSRC_VID_H_ORI:f
mac (1) acc0.2<1>:f fVIDEO_STEP_X:f 8.0:f
mac (1) rAVS_PAYLOAD.2:f fVIDEO_STEP_DELTA:f 28.0:f
// Calculating dx(8)
mov (1) acc0.1<1>:f fVIDEO_STEP_X:f
mac (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_DELTA:f 8.0:f
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: DI.inc
#ifdef GT
// GT DI Kernel
#else // ILK
// ILK DI Kernel
#endif
//---------------------------------------------------------------------------
// Binding table indices
//---------------------------------------------------------------------------
#define nBIDX_DI_PRV 10 // Previous DI-ed frame
#define nBIDX_DI_CUR 13 // Current DI-ed frame
#define nBIDX_DN 7 // Denoised frame
#define nBIDX_STAT 20 // Statistics
#define nBIDX_DI_Source 4 // Source Surface
//---------------------------------------------------------------------------
// Message descriptors
//---------------------------------------------------------------------------
// Extended message descriptor
#define nSMPL_ENGINE 0x2
#define nDATAPORT_WRITE 0x5
#define nTS_EOT 0x27 // with End-Of-Thread bit ON
// Message descriptor for end-of-thread
// = 000 0001 (message len) 00000 (resp len)
// 0 (header present 0) 00000000000000 0 (URB dereferenced) 0000
#define nEOT_MSGDSC 0x02000000
// Message descriptor for sampler read
// = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
// 1 (header present 1) 0 11 (SIMD32/64 mode)
// 1000 (message type) 0000 (DI state index)
// 00000000 (binding table index - set later)
// = 0x040b8000
// comment begin
// The following is commented out because of walker feature
// It corresponds to the #ifdef GT #else and #endif
//#define nSMPL_MSGDSC 0x040b8000
//#define nSMPL_RESP_LEN_DI 0x00c00000 // 12
//#define nSMPL_RESP_LEN_NODI_PL 0x00500000 // 5
//#define nSMPL_RESP_LEN_NODI_PA 0x00900000 // 9
//#define nSMPL_RESP_LEN_NODN 0x00900000 // 9
//#define nSMPL_RESP_LEN_PDI 0x00b00000 // 11
// comment end
#ifdef GT
#define nSMPL_MSGDSC 0x040b8000
#define nSMPL_RESP_LEN_DI 0x00c00000 // 12
#define nSMPL_RESP_LEN_NODI_PL 0x00500000 // 5 //DI disable, the XY stored in 5th GRF, no impact to return length
#define nSMPL_RESP_LEN_NODI_PA 0x00900000 // 9 //DI disable, the XY stored in 5th GRF, no impact to return length
#define nSMPL_RESP_LEN_NODN 0x00a00000 // 10 //NO DN, originally use 9, now we need use 10 to store the XY with walker
#define nSMPL_RESP_LEN_PDI 0x00b00000 // 11
#else
#define nSMPL_MSGDSC 0x040b8000
#define nSMPL_RESP_LEN_DI 0x00c00000 // 12
#define nSMPL_RESP_LEN_NODI_PL 0x00500000 // 5
#define nSMPL_RESP_LEN_NODI_PA 0x00900000 // 9
#define nSMPL_RESP_LEN_NODN 0x00900000 // 9
#define nSMPL_RESP_LEN_PDI 0x00b00000 // 11
#endif
// Message descriptor for dataport media write
#ifdef GT
// = 000 0000 (message len - set later) 00000 (resp len 0)
// 1 (header present 1) 0 0 1010 (media block write) 00000
// 00000000 (binding table index - set later)
// = 0x00094000
#define nDPMW_MSGDSC 0x00094000
#else // ILK
// = 000 0000 (message len - set later) 00000 (resp len 0)
// 1 (header present 1) 000 0 010 (media block write) 0000
// 00000000 (binding table index - set later)
// = 0x00082000
#define nDPMW_MSGDSC 0x00082000
#endif
#define nDPMW_MSG_LEN_STMM 0x04000000 // 2 - STMM
#define nDPMW_MSG_LEN_DH 0x04000000 // 2 - Denoise history
#define nDPMW_MSG_LEN_PA_DN 0x0a000000 // 5 - Denoised output
#define nDPMW_MSG_LEN_PA_NODI 0x12000000 // 9 - Denoised output - denoise only - DI disabled
#define nDPMW_MSG_LEN_PL_DN 0x06000000 // 3 - Denoised output
#define nDPMW_MSG_LEN_PL_NODI 0x0a000000 // 5 - Denoised output - denoise only - DI disabled
#define nDPMW_MSG_LEN_DI 0x0a000000 // 5 - DI output
//---------------------------------------------------------------------------
// Static and inline parameters
//---------------------------------------------------------------------------
// Static parameters
.declare ubTFLD_FIRST Base=r1.27 ElementSize=1 Type=ub // top field first
.declare ubSRCYUVOFFSET Base=r1.4 ElementSize=1 Type=ub // source packed format
.declare ubDSTYUVOFFSET Base=r1.8 ElementSize=1 Type=ub // destination packed format
.declare uwSPITCH_DIV2 Base=r1.10 ElementSize=2 Type=uw // statistics surface pitch divided by 2
// Inline parameters
.declare uwXORIGIN Base=r5.0 ElementSize=2 Type=uw // X and Y origin
.declare uwYORIGIN Base=r5.1 ElementSize=2 Type=uw
//---------------------------------------------------------------------------
// Kernel GRF variables
//---------------------------------------------------------------------------
// Message response (Denoised & DI-ed pixels & statistics)
.declare dRESP Base=r8 ElementSize=4 Type=d // Response message (12 or 5 or 11)
.declare ubRESP Base=r8 ElementSize=1 Type=ub
.declare dSTMM Base=r16 ElementSize=4 Type=d // STMM
.declare ubDN_HIST_NODI Base=r12 ElementSize=1 Type=ub // Denoise history data (DI disabled)
.declare ubDN_HIST_DI Base=r17 ElementSize=1 Type=ub // Denoise history data (DI enabled)
.declare uwRETURNED_POSITION_DI Base=r17 ElementSize=2 Type=uw // XY_Return_Data (DI enabled)
.declare uwRETURNED_POSITION_DN Base=r12 ElementSize=2 Type=uw // XY_Return_Data (DI disabled)
.declare ub1ST_FLD_DN Base=r12 ElementSize=1 Type=ub // 1st field Denoised data (DI enabled)
.declare d1ST_FLD_DN Base=r12 ElementSize=4 Type=d
.declare ub2ND_FLD_DN Base=r18 ElementSize=1 Type=ub // 2nd field Denoised data (DI enabled)
.declare d2ND_FLD_DN Base=r18 ElementSize=4 Type=d
.declare ubPRV_DI Base=r8 ElementSize=1 Type=ub // Previous frame DI (DI enabled)
.declare ubCUR_DI Base=r12 ElementSize=1 Type=ub // Previous frame DI (DI enabled)
// Packed denoised output
.declare ubDN_YUV Base=r22 ElementSize=1 Type=ub // Denoised YUV422
.declare dDN_YUV Base=r22 ElementSize=4 Type=d
#define npDN_YUV 704 // = 22*32 = 0x280
// Packed DI output
.declare dDI_YUV_PRV Base=r32 ElementSize=4 Type=d // Previous frame DI output
.declare dDI_YUV_CUR Base=r36 ElementSize=4 Type=d // Current frame DI output
#define npDI_YUV 1024 // = 32*32 = 0x
// For packed output
#define p422_YOFFSET a0.2
#define p422_UOFFSET a0.3
#define p422_VOFFSET a0.4
#define pDN_TFLDSRC a0.6
#define pDN_BFLDSRC a0.7
#define npRESP 192 // = 6*32
// Message source
.declare udMSGSRC Base=r70 ElementSize=4 Type=ud
.declare uwMSGSRC Base=r70 ElementSize=2 Type=uw
.declare dMSGSRC Base=r70 ElementSize=4 Type=d
//---------------------------------------------------------------------------
// Kernel MRF variables
//---------------------------------------------------------------------------
#define mMSGHDR_SMPL m1 // Sampler response: m1~m2
.declare mudMSGHDR_SMPL Base=m1 ElementSize=4 Type=ud
.declare muwMSGHDR_SMPL Base=m1 ElementSize=2 Type=uw
#define mMSGHDR_DN m3 // Denoise output: m3~m7 for PA, m3~m5 for PL
.declare mdMSGHDR_DN Base=m3 ElementSize=4 Type=d
#define mMSGHDR_STAT m8 // Statistics output: m8~m9
.declare mdMSGHDR_STAT Base=m8 ElementSize=4 Type=d
.declare mubMSGHDR_STAT Base=m8 ElementSize=1 Type=ub
#define mMSGHDR_DI m10 // DI output: m10~m14
.declare mdMSGHDR_DI Base=m10 ElementSize=4 Type=d
#define mMSGHDR_EOT m15 // EOT
#ifdef GT
#define MSGSRC
#else
#define MSGSRC null:ud
#endif
//---------------------------------------------------------------------------
// End of thread instruction
//---------------------------------------------------------------------------
#ifdef GT
#define END_THREAD send (8) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC
#else // ILK
#define END_THREAD send (8) null<1>:d mMSGHDR_EOT null:ud nTS_EOT nEOT_MSGDSC
#endif
// end of DI.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Write denoise history to memory
shr (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w 2:w NODDCLR // X,Y origin / 4
add (1) rMSGSRC.0<1>:ud rMSGSRC.0<0;1,0>:ud uwSPITCH_DIV2<0;1,0>:uw NODDCLR_NODDCHK // Add pitch to X origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_HIST:ud NODDCHK // block width and height (4x2)
mov (8) mMSGHDR_HIST<1>:ud rMSGSRC.0<8;8,1>:ud // message header
mov (1) mudMSGHDR_HIST(1)<1> udRESP(nDI_HIST_OFFSET,0)<0;1,0> // Move denoise history to MRF
send (8) dNULLREG mMSGHDR_HIST udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
shl (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR // H. block origin need to be doubled
mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w NODDCLR_NODDCHK // Block origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DI:ud NODDCHK // Block width and height (32x8)
add (4) pCF_Y_OFFSET<1>:uw ubDEST_CF_OFFSET<4;4,1>:ub nDEST_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block
// Pack 2nd field Y
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
}
// Pack 1st field Y
$for(0; <nY_NUM_OF_ROWS; 1) {
mov (16) r[pCF_Y_OFFSET, %1+4*nGRFWIB]<2> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
}
// Pack 2nd field U
$for(0; <nUV_NUM_OF_ROWS; 1) {
mov (8) r[pCF_U_OFFSET, %1*nGRFWIB]<4> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
}
// Pack 1st field U
$for(0; <nUV_NUM_OF_ROWS; 1) {
mov (8) r[pCF_U_OFFSET, %1+4*nGRFWIB]<4> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
}
// Pack 2nd field V
$for(0; <nUV_NUM_OF_ROWS; 1) {
mov (8) r[pCF_V_OFFSET, %1*nGRFWIB]<4> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //Vpixels
}
// Packs1st field V
$for(0; <nUV_NUM_OF_ROWS; 1) {
mov (8) r[pCF_V_OFFSET, %1+4*nGRFWIB]<4> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //Vpixels
}
//save the previous frame
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
$for(0; <4; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_YUV(%1)REGION(8,1)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_1_YUV:ud
//save the current frame
mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud
$for(0; <4; 1) {
mov (8) mudMSGPAYLOAD(%1)<1> udDEST_YUV(%1+4)REGION(8,1)
}
send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_2_YUV:ud
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Module name: DI.inc
#ifdef GT
// GT DI Kernel
#else // ILK
// ILK DI Kernel
#endif
#include "undefall.inc"
//---------------------------------------------------------------------------
// Message descriptors
//---------------------------------------------------------------------------
// Extended message descriptor
// Message descriptor for sampler read
// // = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11)
// // 1 (header present 1) 0 11 (SIMD32/64 mode)
// // 1000 (message type) 0000 (DI state index)
// // 00000000 (binding table index - set later)
// // = 0x040b8000
#define nSMPL_DI_MSGDSC 0x040b8000
#define nSMPL_RESP_LEN_DNDI nRESLEN_12 // 12 - for DN + DI Alg
#define nSMPL_RESP_LEN_DN_PL nRESLEN_5 // 5 - for DN Planar Alg
#define nSMPL_RESP_LEN_DN_PA nRESLEN_9 // 9 - for DN Packed Alg
#define nSMPL_RESP_LEN_DI nRESLEN_9 // 9 - for DI Only Alg
#define nSMPL_RESP_LEN_PDI nRESLEN_11 // 11 - for Partial DI Alg
// Attention: The Message Length is The Number of GRFs with Data Only, without the Header
#define nDPMW_MSG_LEN_STMM nMSGLEN_1 // 1 - For STMM Save
#define nDPMW_MSG_LEN_HIST nMSGLEN_1 // 1 - For Denoise History Save
#define nDPMW_MSG_LEN_PA_DN_DI nMSGLEN_4 // 4 - For DN Curr Save
#define nDPMW_MSG_LEN_PA_DN_NODI nMSGLEN_8 // 8 - For DN Curr Save (denoise only - DI disabled)
#define nDPMW_MSG_LEN_PL_DN_DI nMSGLEN_2 // 2 - For DN Curr Save
#define nDPMW_MSG_LEN_PL_DN_NODI nMSGLEN_4 // 4 - For DN Curr Save (denoise only - DI disabled)
#define nDPW_BLOCK_SIZE_STMM nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // Y block size 8x4
#undef nDPW_BLOCK_SIZE_DI
#undef nDPW_MSG_SIZE_DI
#define nDPW_BLOCK_SIZE_DI nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4
#define nDPW_MSG_SIZE_DI nMSGLEN_4
//---------------------------------------------------------------------------
// Kernel GRF variables
//---------------------------------------------------------------------------
// Defines for DI enabled
#define nDI_PREV_FRAME_LUMA_OFFSET 0
#define nDI_PREV_FRAME_CHROMA_OFFSET 2
#define nDI_CURR_FRAME_LUMA_OFFSET 4
#define nDI_CURR_FRAME_CHROMA_OFFSET 6
#define nDI_STMM_OFFSET 8
#define nDI_HIST_OFFSET 9
#define nDI_CURR_2ND_FIELD_LUMA_OFFSET 10
#define nDI_CURR_2ND_FIELD_CHROMA_OFFSET 11
// Defines for DI disabled
#define nNODI_LUMA_OFFSET 0
#define nNODI_HIST_OFFSET 4
#define nNODI_CHROMA_OFFSET 5
#ifdef DI_ENABLE
#define nHIST_OFFSET nDI_HIST_OFFSET
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8 // Number of Y rows per block (4 rows for each frame)
#undef nUV_NUM_OF_ROWS
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#endif
#ifdef DI_DISABLE
#define nHIST_OFFSET nNODI_HIST_OFFSET
#endif
#if (nSRC_REGION==nREGION_2)
#define ub2SRC_Y ub2BOT_Y
#define ub2SRC_U ub2BOT_U
#define ub2SRC_V ub2BOT_V
#define uwDEST_Y uwBOT_Y
#define uwDEST_U uwBOT_U
#define uwDEST_V uwBOT_V
#define nDEST_YUV_REG nTOP_Y
#define udDEST_YUV udTOP_Y_IO
#define nRESP nTEMP0 // DI return message requires 12 GRFs
#define nDN_YUV nTOP_Y // Space for Packing DN for next run requires 8 GRFs
#undef nSRC_REGION
#define nSRC_REGION nREGION_2
#else
#define ub2SRC_Y ub2TOP_Y
#define ub2SRC_U ub2TOP_U
#define ub2SRC_V ub2TOP_V
#define uwDEST_Y uwTOP_Y
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
#define nDEST_YUV_REG nBOT_Y
#define udDEST_YUV udBOT_Y_IO
#define nRESP nTEMP0 // DI return message requires 12 GRFs
#define nDN_YUV nBOT_Y // Space for Packing DN for next run requires 8 GRFs
#undef nSRC_REGION
#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
#endif
// Message response (Denoised & DI-ed pixels & statistics)
.declare udRESP Base=REG(r,nRESP) ElementSize=4 SrcRegion=REGION(8,1) DstRegion=<1> Type=ud
.declare ubRESP Base=REG(r,nRESP) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
// For Denoised Curr Output (Used as Priv in Next Run)
.declare ubDN_YUV Base=REG(r,nDN_YUV) ElementSize=1 Type=ub
.declare udDN_YUV Base=REG(r,nDN_YUV) ElementSize=4 Type=ud
#define npDN_YUV nDN_YUV*nGRFWIB
// For DI Process Output (1st and 2nd Frames Output)
//.declare udDI_YUV_PRIV Base=REG(r,nTEMP0) ElementSize=4 Type=ud // Previous frame DI output
//.declare udDI_YUV_CURR Base=REG(r,nTEMP0) ElementSize=4 Type=ud // Current frame DI output
//#define npDI_YUV nTEMP0*nGRFWIB
//---------------------------------------------------------------------------
// Kernel MRF variables
//---------------------------------------------------------------------------
#define mMSG_SMPL m1 // Sampler Command is in: m1~m2
.declare mudMSG_SMPL Base=mMSG_SMPL ElementSize=4 Type=ud
.declare muwMSG_SMPL Base=mMSG_SMPL ElementSize=2 Type=uw
#define mMSGHDR_DN m1 // Denoise Output: m1~m9 for PA, m3~m5 for PL
.declare mudMSGHDR_DN Base=mMSGHDR_DN ElementSize=4 Type=ud
.declare mubMSGHDR_DN Base=mMSGHDR_DN ElementSize=1 Type=ub
#define mMSGHDR_STMM m11 // STMM Output: m11~m12
.declare mudMSGHDR_STMM Base=mMSGHDR_STMM ElementSize=4 Type=ud
#define mMSGHDR_HIST m13 // HIST Output: m13~m14
.declare mudMSGHDR_HIST Base=mMSGHDR_HIST ElementSize=1 Type=ud
#define mMSGHDR_DI_1ST m1 // DI output: m1~m5
.declare mudMSGHDR_DI_1ST Base=mMSGHDR_DI_1ST ElementSize=4 Type=ud
#define mMSGHDR_DI_2ND m6 // DI output: m6~m10
.declare mudMSGHDR_DI_2ND Base=mMSGHDR_DI_2ND ElementSize=4 Type=ud
// end of DNDI.inc
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Activate the DNDI send command
mov (8) mudMSG_SMPL(0)<1> rMSGSRC.0<8;8,1>:ud NODDCLR // message header
mov (1) muwMSG_SMPL(1,4)<1> wORIX<0;1,0>:w NODDCLR_NODDCHK// horizontal origin
mov (1) muwMSG_SMPL(1,12)<1> wORIY<0;1,0>:w NODDCLR_NODDCHK // vertical origin
//mov (2) muwMSG_SMPL(1,4)<2> wORIX<2;2,1>:w NODDCHK// problem during compile !! when using this line
send (8) udRESP(0)<1> mMSG_SMPL udDUMMY_NULL nSMPL_ENGINE nSMPL_DI_MSGDSC+nSMPL_RESP_LEN+nBI_CURRENT_SRC_YUV_HW_DI:ud
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
// Write denoise history to memory
shr (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w 2:w NODDCLR // X,Y origin / 4
add (1) rMSGSRC.0<1>:ud rMSGSRC.0<0;1,0>:ud uwSPITCH_DIV2<0;1,0>:uw NODDCLR_NODDCHK// Add pitch to X origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_HIST:ud NODDCHK // block width and height (4x2)
mov (8) mMSGHDR_HIST<1>:ud rMSGSRC.0<8;8,1>:ud // message header
mov (2) mudMSGHDR_HIST(1)<1> udRESP(nNODI_HIST_OFFSET,0)<2;2,1> // Move denoise history to MRF
send (8) dNULLREG mMSGHDR_HIST udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_AVS_IEF_16x8.asm ----------
#include "AVS_IEF.inc"
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 YUV packed
//------------------------------------------------------------------------------
#include "PA_AVS_IEF_Sample.asm"
//------------------------------------------------------------------------------
// Unpacking sampler reads to 4:4:4 internal planar
//------------------------------------------------------------------------------
#include "PA_AVS_IEF_Unpack_16x8.asm"
//------------------------------------------------------------------------------
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_AVS_IEF_8x4.asm ----------
#include "AVS_IEF.inc"
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 YUV packed
//------------------------------------------------------------------------------
#include "PA_AVS_IEF_Sample.asm"
//------------------------------------------------------------------------------
// Unpacking sampler data to 4:2:0 internal planar
//------------------------------------------------------------------------------
#include "PA_AVS_IEF_Unpack_8x4.asm"
//------------------------------------------------------------------------------
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_AVS_IEF_8x8.asm ----------
#include "AVS_IEF.inc"
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 YUV packed
//------------------------------------------------------------------------------
#include "PA_AVS_IEF_Sample.asm"
//------------------------------------------------------------------------------
// Unpacking sampler data to 4:2:2 internal planar
//------------------------------------------------------------------------------
#include "PA_AVS_IEF_Unpack_8x8.asm"
//------------------------------------------------------------------------------
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_AVS_IEF_Sample.asm ----------
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 YUV packed
//------------------------------------------------------------------------------
// 1st 8x8 setup
#include "AVS_SetupFirstBlock.asm"
// Enable RGB(YUV) channels
mov (1) rAVS_8x8_HDR.2:ud nAVS_RGB_CHANNELS:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
// Return YUV in 12 GRFs
// 2nd 8x8 setup
#include "AVS_SetupSecondBlock.asm"
mov (16) mAVS_8x8_HDR_2.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2 udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
// Return YUV in 12 GRFs
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_AVS_IEF_Unpack_8x8.asm ----------
// Yoni: In order to optimize unpacking, 3 methods are being checked:
// 1. AVS_ORIGINAL
// 2. AVS_ROUND_TO_8_BITS
// 3. AVS_INDIRECT_ACCESS
//
// Only 1 method should stay in the code
//#define AVS_ROUND_TO_8_BITS
//#define AVS_INDIRECT_ACCESS
// Move first 8x8 words of Y to dest GRF
mov (8) uwDEST_Y(0)<1> ubAVS_RESPONSE(2,1)<16;4,2>
mov (8) uwDEST_Y(1)<1> ubAVS_RESPONSE(2,8+1)<16;4,2>
mov (8) uwDEST_Y(2)<1> ubAVS_RESPONSE(3,1)<16;4,2>
mov (8) uwDEST_Y(3)<1> ubAVS_RESPONSE(3,8+1)<16;4,2>
mov (8) uwDEST_Y(4)<1> ubAVS_RESPONSE(8,1)<16;4,2>
mov (8) uwDEST_Y(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2>
mov (8) uwDEST_Y(6)<1> ubAVS_RESPONSE(9,1)<16;4,2>
mov (8) uwDEST_Y(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2>
// Move first 4x8 words of V to dest GRF
mov (4) uwDEST_V(0)<1> ubAVS_RESPONSE(0,1)<16;2,4>
mov (4) uwDEST_V(0,8)<1> ubAVS_RESPONSE(1,1)<16;2,4>
mov (4) uwDEST_V(1)<1> ubAVS_RESPONSE(6,1)<16;2,4>
mov (4) uwDEST_V(1,8)<1> ubAVS_RESPONSE(7,1)<16;2,4>
// Move first 4x8 words of U to dest GRF
mov (4) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;2,4>
mov (4) uwDEST_U(0,8)<1> ubAVS_RESPONSE(5,1)<16;2,4>
mov (4) uwDEST_U(1)<1> ubAVS_RESPONSE(10,1)<16;2,4>
mov (4) uwDEST_U(1,8)<1> ubAVS_RESPONSE(11,1)<16;2,4>
// Move second 8x8 words of Y to dest GRF
mov (8) uwDEST_Y(0,8)<1> ubAVS_RESPONSE_2(2,1)<16;4,2>
mov (8) uwDEST_Y(1,8)<1> ubAVS_RESPONSE_2(2,8+1)<16;4,2>
mov (8) uwDEST_Y(2,8)<1> ubAVS_RESPONSE_2(3,1)<16;4,2>
mov (8) uwDEST_Y(3,8)<1> ubAVS_RESPONSE_2(3,8+1)<16;4,2>
mov (8) uwDEST_Y(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2>
mov (8) uwDEST_Y(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2>
mov (8) uwDEST_Y(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2>
mov (8) uwDEST_Y(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2>
// Move second 4x8 words of V to dest GRF
mov (4) uwDEST_V(0,4)<1> ubAVS_RESPONSE_2(0,1)<16;2,4>
mov (4) uwDEST_V(0,12)<1> ubAVS_RESPONSE_2(1,1)<16;2,4>
mov (4) uwDEST_V(1,4)<1> ubAVS_RESPONSE_2(6,1)<16;2,4>
mov (4) uwDEST_V(1,12)<1> ubAVS_RESPONSE_2(7,1)<16;2,4>
// Move second 4x8 words of U to dest GRF
mov (4) uwDEST_U(0,4)<1> ubAVS_RESPONSE_2(4,1)<16;2,4>
mov (4) uwDEST_U(0,12)<1> ubAVS_RESPONSE_2(5,1)<16;2,4>
mov (4) uwDEST_U(1,4)<1> ubAVS_RESPONSE_2(10,1)<16;2,4>
mov (4) uwDEST_U(1,12)<1> ubAVS_RESPONSE_2(11,1)<16;2,4>
//------------------------------------------------------------------------------
// Re-define new number of lines
#undef nUV_NUM_OF_ROWS
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8
#define nUV_NUM_OF_ROWS 8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_AVS_IEF_Unpack_8x8.asm ----------
// Yoni: In order to optimize unpacking, 3 methods are being checked:
// 1. AVS_ORIGINAL
// 2. AVS_ROUND_TO_8_BITS
// 3. AVS_INDIRECT_ACCESS
//
// Only 1 method should stay in the code
//#define AVS_ROUND_TO_8_BITS
//#define AVS_INDIRECT_ACCESS
// Move first 8x8 words of Y to dest GRF
mov (8) uwDEST_Y(0)<1> ubAVS_RESPONSE(2,1)<16;4,2>
mov (8) uwDEST_Y(1)<1> ubAVS_RESPONSE(2,8+1)<16;4,2>
mov (8) uwDEST_Y(2)<1> ubAVS_RESPONSE(3,1)<16;4,2>
mov (8) uwDEST_Y(3)<1> ubAVS_RESPONSE(3,8+1)<16;4,2>
mov (8) uwDEST_Y(4)<1> ubAVS_RESPONSE(8,1)<16;4,2>
mov (8) uwDEST_Y(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2>
mov (8) uwDEST_Y(6)<1> ubAVS_RESPONSE(9,1)<16;4,2>
mov (8) uwDEST_Y(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2>
// Move first 4x8 words of V to dest GRF
mov (4) uwDEST_V(0)<1> ubAVS_RESPONSE(0,1)<16;2,4>
mov (4) uwDEST_V(0,8)<1> ubAVS_RESPONSE(0,8+1)<16;2,4>
mov (4) uwDEST_V(1)<1> ubAVS_RESPONSE(1,1)<16;2,4>
mov (4) uwDEST_V(1,8)<1> ubAVS_RESPONSE(1,8+1)<16;2,4>
mov (4) uwDEST_V(2)<1> ubAVS_RESPONSE(6,1)<16;2,4>
mov (4) uwDEST_V(2,8)<1> ubAVS_RESPONSE(6,8+1)<16;2,4>
mov (4) uwDEST_V(3)<1> ubAVS_RESPONSE(7,1)<16;2,4>
mov (4) uwDEST_V(3,8)<1> ubAVS_RESPONSE(7,8+1)<16;2,4>
// Move first 4x8 words of U to dest GRF
mov (4) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;2,4>
mov (4) uwDEST_U(0,8)<1> ubAVS_RESPONSE(4,8+1)<16;2,4>
mov (4) uwDEST_U(1)<1> ubAVS_RESPONSE(5,1)<16;2,4>
mov (4) uwDEST_U(1,8)<1> ubAVS_RESPONSE(5,8+1)<16;2,4>
mov (4) uwDEST_U(2)<1> ubAVS_RESPONSE(10,1)<16;2,4>
mov (4) uwDEST_U(2,8)<1> ubAVS_RESPONSE(10,8+1)<16;2,4>
mov (4) uwDEST_U(3)<1> ubAVS_RESPONSE(11,1)<16;2,4>
mov (4) uwDEST_U(3,8)<1> ubAVS_RESPONSE(11,8+1)<16;2,4>
// Move second 8x8 words of Y to dest GRF
mov (8) uwDEST_Y(0,8)<1> ubAVS_RESPONSE_2(2,1)<16;4,2>
mov (8) uwDEST_Y(1,8)<1> ubAVS_RESPONSE_2(2,8+1)<16;4,2>
mov (8) uwDEST_Y(2,8)<1> ubAVS_RESPONSE_2(3,1)<16;4,2>
mov (8) uwDEST_Y(3,8)<1> ubAVS_RESPONSE_2(3,8+1)<16;4,2>
mov (8) uwDEST_Y(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2>
mov (8) uwDEST_Y(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2>
mov (8) uwDEST_Y(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2>
mov (8) uwDEST_Y(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2>
// Move second 4x8 words of V to dest GRF
mov (4) uwDEST_V(0,4)<1> ubAVS_RESPONSE_2(0,1)<16;2,4>
mov (4) uwDEST_V(0,12)<1> ubAVS_RESPONSE_2(0,8+1)<16;2,4>
mov (4) uwDEST_V(1,4)<1> ubAVS_RESPONSE_2(1,1)<16;2,4>
mov (4) uwDEST_V(1,12)<1> ubAVS_RESPONSE_2(1,8+1)<16;2,4>
mov (4) uwDEST_V(2,4)<1> ubAVS_RESPONSE_2(6,1)<16;2,4>
mov (4) uwDEST_V(2,12)<1> ubAVS_RESPONSE_2(6,8+1)<16;2,4>
mov (4) uwDEST_V(3,4)<1> ubAVS_RESPONSE_2(7,1)<16;2,4>
mov (4) uwDEST_V(3,12)<1> ubAVS_RESPONSE_2(7,8+1)<16;2,4>
// Move second 4x8 words of U to dest GRF
mov (4) uwDEST_U(0,4)<1> ubAVS_RESPONSE_2(4,1)<16;2,4>
mov (4) uwDEST_U(0,12)<1> ubAVS_RESPONSE_2(4,8+1)<16;2,4>
mov (4) uwDEST_U(1,4)<1> ubAVS_RESPONSE_2(5,1)<16;2,4>
mov (4) uwDEST_U(1,12)<1> ubAVS_RESPONSE_2(5,8+1)<16;2,4>
mov (4) uwDEST_U(2,4)<1> ubAVS_RESPONSE_2(10,1)<16;2,4>
mov (4) uwDEST_U(2,12)<1> ubAVS_RESPONSE_2(10,8+1)<16;2,4>
mov (4) uwDEST_U(3,4)<1> ubAVS_RESPONSE_2(11,1)<16;2,4>
mov (4) uwDEST_U(3,12)<1> ubAVS_RESPONSE_2(11,8+1)<16;2,4>
//------------------------------------------------------------------------------
// Re-define new number of lines
#undef nUV_NUM_OF_ROWS
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8
#define nUV_NUM_OF_ROWS 8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
#define DI_ENABLE
#include "DNDI.inc"
#ifdef DI_ONLY
#undef nSMPL_RESP_LEN
#define nSMPL_RESP_LEN nSMPL_RESP_LEN_DI // set the number of GRF
#else
#undef nSMPL_RESP_LEN
#define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF
#endif
#undef nDPW_BLOCK_SIZE_HIST
#define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write is 4x2
#undef nDPW_BLOCK_SIZE_DN
#define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4 // DN Block Size for Write is 32x4
////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
#include "DNDI_Command.asm"
////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
//// move the previous frame Y component to internal planar format
//$for (0; <nY_NUM_OF_ROWS/2; 1) {
// mov (16) uwDEST_Y(%1,0)<1> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
//}
//// move the previous frame U,V components to internal planar format
//$for (0; <nUV_NUM_OF_ROWS/2; 1) {
// mov (8) uwDEST_U(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
// mov (8) uwDEST_V(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
//}
//// move the current frame Y component to internal planar format
//$for (0; <nY_NUM_OF_ROWS/2; 1) {
// mov (16) uwDEST_Y(%1+4,0)<1> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
//}
//// move the current frame U,V components to internal planar format
//$for (0; <nUV_NUM_OF_ROWS/2; 1) {
// mov (8) uwDEST_U(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
// mov (8) uwDEST_V(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
//}
////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
// Write STMM to memory
shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR // X origin / 2
mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w NODDCLR_NODDCHK // Y origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud NODDCHK // block width and height (8x4)
mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header
mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF
send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
////////////////////////////////////// Save the History Data for Next Run /////////////////////////
#ifdef DI_ONLY
#else
#include "DI_Hist_Save.asm"
////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
// check top/bottom field first
cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w
add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub npDN_YUV:uw
//set the save DN position
shl (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR // X origin * 2
mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w NODDCLR_NODDCHK // Y origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud NODDCHK // block width and height (8x4)
mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud
(f0.0) jmpi (1) TOP_FIELD_FIRST
BOTTOM_FIELD_FIRST:
//$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
// mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
// mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
// mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
// mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
// mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
// mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field U from current frame (line 1,3)
//}
$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
}
$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
}
$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field U from current frame (line 1,3)
}
jmpi (1) SAVE_DN_CURR
TOP_FIELD_FIRST:
//$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
// mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
// mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
// mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
// mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
// mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
// mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
//}
$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
}
$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
}
$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
}
SAVE_DN_CURR:
$for(0; <nY_NUM_OF_ROWS/2; 1) {
mov (8) mudMSGHDR_DN(%1+1)<1> udDN_YUV(%1)REGION(8,1)
}
send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_DI+nBI_DESTINATION_YUV:ud
#endif
// Save Processed frames
#include "DI_Save_PA.asm"
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
#define DI_DISABLE
#include "DNDI.inc"
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
#undef nUV_NUM_OF_ROWS
#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
#undef nSMPL_RESP_LEN
#define nSMPL_RESP_LEN nSMPL_RESP_LEN_DN_PA // Set the Number of GRFs in DNDI response
#undef nDPW_BLOCK_SIZE_DN
#define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // DN Curr Block Size for Write is 32x8
#undef nDPW_BLOCK_SIZE_HIST
#define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2 // HIST Block Size for Write is 4x2
////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
#include "DNDI_COMMAND.asm"
////////////////////////////////////// Save the History Data for Next Run /////////////////////////
#include "DNDI_Hist_Save.asm"
////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
add (4) pCF_Y_OFFSET<1>:uw ubDEST_CF_OFFSET<4;4,1>:ub npDN_YUV:w
$for (0; <nY_NUM_OF_ROWS; 1) {
mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1> // copy line of Y
}
$for (0; <nUV_NUM_OF_ROWS; 1) {
mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nNODI_CHROMA_OFFSET,%1*16+1)<16;8,2> // copy line of U
mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nNODI_CHROMA_OFFSET,%1*16)<16;8,2> // copy line of V
}
shl (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w // X origin * 2 (422 output)
mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w // Y origin
mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (32x8)
mov (8) mMSGHDR_DN<1>:ud rMSGSRC<8;8,1>:ud // message header
$for(0; <nY_NUM_OF_ROWS; 2) {
mov (16) mudMSGHDR_DN(1+%1)<1> udDN_YUV(%1)REGION(8,1) // Move DN Curr to MRF
}
send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_NODI+nBI_DESTINATION_YUV:ud
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PA_Scaling.asm ----------
#include "Scaling.inc"
// Build 16 elements ramp in float32 and normalized it
// mov (8) SAMPLER_RAMP(0)<1> 0x76543210:v
// add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f
mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf //3, 2, 1, 0 in float vector
mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf //7, 6, 5, 4 in float vector
add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f
//Module: PrepareScaleCoord.asm
// Setup for sampler msg hdr
mov (2) rMSGSRC.0<1>:ud 0:ud { NoDDClr } // Unused fields
mov (1) rMSGSRC.2<1>:ud 0:ud { NoDDChk } // Write and offset
// Calculate 16 v based on the step Y and vertical origin
mov (16) mfMSGPAYLOAD(2)<1> fSRC_VID_V_ORI<0;1,0>:f
mov (16) SCALE_COORD_Y<1>:f fSRC_VID_V_ORI<0;1,0>:f
// Calculate 16 u based on the step X and hori origin
// line (16) mfMSGPAYLOAD(0)<1> SCALE_STEP_X<0;1,0>:f SAMPLER_RAMP(0) // Assign to mrf directly
mov (16) acc0:f fSRC_VID_H_ORI<0;1,0>:f { Compr }
mac (16) mfMSGPAYLOAD(0)<1> fVIDEO_STEP_X<0;1,0>:f SAMPLER_RAMP(0) { Compr }
//Setup the constants for line instruction
mov (1) SCALE_LINE_P255<1>:f 255.0:f { NoDDClr } //{ NoDDClr, NoDDChk }
mov (1) SCALE_LINE_P0_5<1>:f 0.5:f { NoDDChk }
//------------------------------------------------------------------------------
$for (0; <nY_NUM_OF_ROWS; 1) {
// Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
mov (8) MSGHDR_SCALE.0:ud rMSGSRC.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (16) SCALE_RESPONSE_YW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_YUV+nBI_CURRENT_SRC_YUV
// Calculate 16 v for next line
add (16) mfMSGPAYLOAD(2)<1> SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly
add (16) SCALE_COORD_Y<1>:f SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly
// Scale back to [0, 255], convert f to ud
line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(0) { Compr } // Process B, V
mov (16) SCALE_RESPONSE_YD(0)<1> acc0:f { Compr }
line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(2) { Compr } // Process B, V
mov (16) SCALE_RESPONSE_YD(2)<1> acc0:f { Compr }
line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(4) { Compr } // Process B, V
mov (16) SCALE_RESPONSE_YD(4)<1> acc0:f { Compr }
mov (16) DEST_V(%1)<1> SCALE_RESPONSE_YB(0) //possible error due to truncation - vK
mov (16) DEST_Y(%1)<1> SCALE_RESPONSE_YB(2) //possible error due to truncation - vK
mov (16) DEST_U(%1)<1> SCALE_RESPONSE_YB(4) //possible error due to truncation - vK
}
#define nSRC_REGION nREGION_1
//------------------------------------------------------------------------------
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PL2_AVS_IEF_16x8.asm ----------
#include "AVS_IEF.inc"
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 Y each
// 2 sampler read for 8x8 U and 8x8 V (NV11\P208 input surface)
//------------------------------------------------------------------------------
// 1st 8x8 setup
#include "AVS_SetupFirstBlock.asm"
// Enable green channel only
mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
// Return Y in 4 GRFs
// 8x8 U and V sampling
// Enable red and blue channels
mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud
mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
// Return U and V in 8 GRFs
// 2nd 8x8 setup
#include "AVS_SetupSecondBlock.asm"
// 2nd 8x8 Y sampling
// Enable green channel only
mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
// 2nd 8x8 U and V sampling
// Enable red and blue channels
mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud
mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE_2(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
// Return U and V in 8 GRFs
//------------------------------------------------------------------------------
// Unpacking sampler reads to 4:4:4 internal planar
//------------------------------------------------------------------------------
#include "PL2_AVS_IEF_Unpack_16x8.asm"
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PL2_AVS_IEF_8x4.asm ----------
#include "AVS_IEF.inc"
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 Y each
// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
//------------------------------------------------------------------------------
// 1st 8x8 setup
#include "AVS_SetupFirstBlock.asm"
// Enable green channel only
mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
// Return Y in 4 GRFs
// 8x8 U and V sampling
// Enable red and blue channels
//Only 8x4 wil be used
mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud
// Calculate Chroma Step Size:
// for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X
// for V direction: 8 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
mul (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f 2.0:f // Step X for chroma
mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
// Return U and V in 8 GRFs
// 2nd 8x8 setup
#include "AVS_SetupSecondBlock.asm"
// 2nd 8x8 Y sampling
// Enable green channel only
mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
//------------------------------------------------------------------------------
// Unpacking sampler reads to 4:2:0 internal planar
//------------------------------------------------------------------------------
#include "PL2_AVS_IEF_Unpack_8x4.asm"
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PL2_AVS_IEF_8x8.asm ----------
#include "AVS_IEF.inc"
//------------------------------------------------------------------------------
// 2 sampler reads for 8x8 Y each
// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
//------------------------------------------------------------------------------
// 1st 8x8 setup
#include "AVS_SetupFirstBlock.asm"
// Enable green channel only
mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
// Return Y in 4 GRFs
// 8x8 U and V sampling
// Enable red and blue channels
mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud
// Calculate Chroma Step Size:
// for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X
// for V direction: 8 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
mul (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f 2.0:f // Step X for chroma
mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
// Return U and V in 8 GRFs
// 2nd 8x8 setup
#include "AVS_SetupSecondBlock.asm"
// 2nd 8x8 Y sampling
// Enable green channel only
mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud
mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
//------------------------------------------------------------------------------
// Unpacking sampler reads to 4:2:2 internal planar
//------------------------------------------------------------------------------
#include "PL2_AVS_IEF_Unpack_8x8.asm"
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PL2_AVS_IEF_8x4.asm ----------
// Move first 8x8 words of Y to dest GRF at lower 8 words of each RGF.
$for(0; <8/2; 1) {
mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word
mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word
}
// Move 8x4 words of U to dest GRF (Copy high byte in a word)
mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2>
mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE(5,1)<16;4,2>
mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(8,1)<16;4,2>
mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE(9,1)<16;4,2>
// Move 8x4 words of V to dest GRF
mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(6,1)<16;4,2>
mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE(7,1)<16;4,2>
mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(10,1)<16;4,2>
mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE(11,1)<16;4,2>
// Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
$for(0; <8/2; 1) {
mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE_2(%1,1)<16;4,2> // Copy high byte in a word
mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1,8+1)<16;4,2> // Copy high byte in a word
}
//------------------------------------------------------------------------------
// Re-define new # of lines
#undef nUV_NUM_OF_ROWS
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8
#define nUV_NUM_OF_ROWS 4
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PL2_AVS_IEF_8x8.asm ----------
// Move first 8x8 words of Y to dest GRF at lower 8 words of each RGF.
$for(0; <8/2; 1) {
mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word
mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word
}
// Move 8x8 words of U to dest GRF (Copy high byte in a word)
mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2>
mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE(4,8+1)<16;4,2>
mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(5,1)<16;4,2>
mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE(5,8+1)<16;4,2>
mov (8) uwDEST_U(2)<1> ubAVS_RESPONSE(8,1)<16;4,2>
mov (8) uwDEST_U(2,8)<1> ubAVS_RESPONSE(8,8+1)<16;4,2>
mov (8) uwDEST_U(3)<1> ubAVS_RESPONSE(9,1)<16;4,2>
mov (8) uwDEST_U(3,8)<1> ubAVS_RESPONSE(9,8+1)<16;4,2>
// Move 8x8 words of V to dest GRF
mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(6,1)<16;4,2>
mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE(6,8+1)<16;4,2>
mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(7,1)<16;4,2>
mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE(7,8+1)<16;4,2>
mov (8) uwDEST_V(2)<1> ubAVS_RESPONSE(10,1)<16;4,2>
mov (8) uwDEST_V(2,8)<1> ubAVS_RESPONSE(10,8+1)<16;4,2>
mov (8) uwDEST_V(3)<1> ubAVS_RESPONSE(11,1)<16;4,2>
mov (8) uwDEST_V(3,8)<1> ubAVS_RESPONSE(11,8+1)<16;4,2>
// Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
$for(0; <8/2; 1) {
mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE_2(%1,1)<16;4,2> // Copy high byte in a word
mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1,8+1)<16;4,2> // Copy high byte in a word
}
//------------------------------------------------------------------------------
// Re-define new # of lines
#undef nUV_NUM_OF_ROWS
#undef nY_NUM_OF_ROWS
#define nY_NUM_OF_ROWS 8
#define nUV_NUM_OF_ROWS 8
/*
* All Video Processing kernels
* Copyright © <2010>, Intel Corporation.
*
* This program is licensed under the terms and conditions of the
* Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
* http://www.opensource.org/licenses/eclipse-1.0.php.
*
*/
//---------- PL2_Scaling.asm ----------
#include "Scaling.inc"
// Build 16 elements ramp in float32 and normalized it
// mov (8) SAMPLER_RAMP(0)<1> 0x76543210:v
// add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f
mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf //3, 2, 1, 0 in float vector
mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf //7, 6, 5, 4 in float vector
add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f
//Module: PrepareScaleCoord.asm
// Setup for sampler msg hdr
mov (2) rMSGSRC.0<1>:ud 0:ud { NoDDClr } // Unused fields
mov (1) rMSGSRC.2<1>:ud 0:ud { NoDDChk } // Write and offset
// Calculate 16 v based on the step Y and vertical origin
mov (16) mfMSGPAYLOAD(2)<1> fSRC_VID_V_ORI<0;1,0>:f
mov (16) SCALE_COORD_Y<1>:f fSRC_VID_V_ORI<0;1,0>:f
// Calculate 16 u based on the step X and hori origin
// line (16) mfMSGPAYLOAD(0)<1> SCALE_STEP_X<0;1,0>:f SAMPLER_RAMP(0) // Assign to mrf directly
mov (16) acc0:f fSRC_VID_H_ORI<0;1,0>:f { Compr }
mac (16) mfMSGPAYLOAD(0)<1> fVIDEO_STEP_X<0;1,0>:f SAMPLER_RAMP(0) { Compr }
//Setup the constants for line instruction
mov (1) SCALE_LINE_P255<1>:f 255.0:f { NoDDClr } //{ NoDDClr, NoDDChk }
mov (1) SCALE_LINE_P0_5<1>:f 0.5:f { NoDDChk }
//------------------------------------------------------------------------------
$for (0; <nY_NUM_OF_ROWS; 1) {
// Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
mov (8) MSGHDR_SCALE.0:ud rMSGSRC.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
send (16) SCALE_RESPONSE_YW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
send (16) SCALE_RESPONSE_UW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_UV+nBI_CURRENT_SRC_UV
// Calculate 16 v for next line
add (16) mfMSGPAYLOAD(2)<1> SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly
add (16) SCALE_COORD_Y<1>:f SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly
// Scale back to [0, 255], convert f to ud
line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(0) { Compr } // Process B, V
mov (16) SCALE_RESPONSE_YD(0)<1> acc0:f { Compr }
line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_UF(0) { Compr } // Process B, V
mov (16) SCALE_RESPONSE_UD(0)<1> acc0:f { Compr }
line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_UF(2) { Compr } // Process B, V
mov (16) SCALE_RESPONSE_UD(2)<1> acc0:f { Compr }
mov (16) DEST_Y(%1)<1> SCALE_RESPONSE_YB(0) //possible error due to truncation - vK
mov (16) DEST_U(%1)<1> SCALE_RESPONSE_UB(0) //possible error due to truncation - vK
mov (16) DEST_V(%1)<1> SCALE_RESPONSE_UB(2) //possible error due to truncation - vK
}
#define nSRC_REGION nREGION_1
//------------------------------------------------------------------------------
This diff is collapsed.
/* Just for test */
send(16) 0 acc0<1>UW g0<8,8,1>UW thread_spawner(0, 0, 0) mlen 1 rlen 0 {align1 EOT};
{ 0x00800031, 0x24001d28, 0x008d0000, 0x87100000 },
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment