/**************************************************************************
 *                                                                        *
 * This code has been developed by John Funnell. This software is an      *
 * implementation of a part of one or more MPEG-4 Video tools as          *
 * specified in ISO/IEC 14496-2 standard.  Those intending to use this    *
 * software module in hardware or software products are advised that its  *
 * use may infringe existing patents or copyrights, and any such use      *
 * would be at such party's own risk.  The original developer of this     *
 * software module and his/her company, and subsequent editors and their  *
 * companies (including Project Mayo), will have no liability for use of  *
 * this software or modifications or derivatives thereof.                 *
 *                                                                        *
 * Project Mayo gives users of the Codec a license to this software       *
 * module or modifications thereof for use in hardware or software        *
 * products claiming conformance to the MPEG-4 Video Standard as          *
 * described in the Open DivX license.                                    *
 *                                                                        *
 * The complete Open DivX license can be found at                         *
 * http://www.projectmayo.com/opendivx/license.php                        *
 *                                                                        *
 **************************************************************************/
/**
*  Copyright (C) 2001 - Project Mayo
 *
 * John Funnell
 *
 * DivX Advanced Research Center <darc@projectmayo.com>
**/
// postprocess.c //

/* This file contains the deblocking filters and a deringing filter.  The   */
/* vertical deblocking filter operates over eight pixel-wide columns at     */
/* once.  The horizontal deblocking filter works on four horizontal rows    */
/* at a time.                                                               */


#include "postprocess.h"

#include <string.h>


/* Small arithmetic helpers.  They are macros, so an argument may be       */
/* evaluated more than once: never pass an expression with side effects.   */
#define ABS(a)     ( (a)<=0 ? -(a) : (a) )
#define SIGN(a)    ( (a)>=0 ? 1 : -1 )
#define MIN(a, b)  ( (b)>(a) ? (a) : (b) )
#define MAX(a, b)  ( (b)<(a) ? (a) : (b) )


/***** H O R I Z O N T A L   D E B L O C K I N G   F I L T E R *****/


/* Decide between DC mode and default mode for the horizontal filter.      */
/* Over four consecutive rows, counts the horizontally adjacent pixel      */
/* pairs among v[1]..v[8] whose values differ by at most one.  Returns     */
/* non-zero (DC mode) when that count reaches DEBLOCK_HORIZ_USEDC_THR.     */
static inline int deblock_horiz_useDC(uint8_t *v, int stride) {
	int flat_pairs = 0;
	int row, i;

	for (row = 0; row < 4; row++) {
		const uint8_t *line = v + row * stride;

		/* seven neighbouring pairs per row: (1,2), (2,3), ... (7,8) */
		for (i = 1; i <= 7; i++) {
			int diff = line[i] - line[i + 1];

			if (diff >= -1 && diff <= 1) {
				flat_pairs++;
			}
		}
	}

	return (flat_pairs >= DEBLOCK_HORIZ_USEDC_THR);
}


/* Decide whether the DC filter should be turned on according to QP.       */
/* Only the first row is examined (stride is not used): the filter is on   */
/* when |v[1] - v[8]| is below 2*QP.                                       */
static inline int deblock_horiz_DC_on(uint8_t *v, int stride, int QP) {
	/* 99% of the time, this test turns out the same as the |max-min| strategy in the standard */
	int span  = v[1] - v[8];
	int limit = 2 * QP;

	return (ABS(span) < limit);
}


/* The 9-tap low pass filter used in "DC" regions.                         */
/*                                                                         */
/* For each of four consecutive rows, pixels v[1]..v[8] of that row are    */
/* replaced by a low-pass filtered version of themselves.  v[0] and v[9]   */
/* are read but never written.                                             */
/*                                                                         */
/*   v      - points at pixel v0 of the first row                          */
/*   stride - distance in bytes between consecutive rows                   */
/*   QP     - quantiser; selects the padding values p1/p2 (see below)      */
static inline void deblock_horiz_lpf9(uint8_t *v, int stride, int QP) {
	int y;

	for (y=0; y<4; y++) {
		uint8_t *vv = &(v[y*stride]);
		uint8_t vnew[8];      /* vnew[k] is the new value of vv[k+1] */
		int psum, i;

		/* p1 / p2 stand in for the pixels beyond either end of the   */
		/* filtered span: the outer neighbour (v0 / v9) when it is     */
		/* within QP of the end pixel, otherwise the end pixel itself  */
		int p1 = (ABS(vv[0]-vv[1]) < QP ) ?  vv[0] : vv[1];
		int p2 = (ABS(vv[8]-vv[9]) < QP ) ?  vv[9] : vv[8];

		/* C implementation of horizontal LPF; psum is a running sum  */
		/* that is slid one pixel to the right after every output     */
		psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4;
		vnew[0] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4;
		psum += vv[5] - p1;
		vnew[1] = (((psum + vv[2]) << 1) - (vv[5] - vv[6])) >> 4;
		psum += vv[6] - p1;
		vnew[2] = (((psum + vv[3]) << 1) - (vv[6] - vv[7])) >> 4;
		psum += vv[7] - p1;
		vnew[3] = (((psum + vv[4]) << 1) + p1 - vv[1] - (vv[7] - vv[8])) >> 4;
		psum += vv[8] - vv[1];
		vnew[4] = (((psum + vv[5]) << 1) + (vv[1] - vv[2]) - vv[8] + p2) >> 4;
		psum += p2 - vv[2];
		vnew[5] = (((psum + vv[6]) << 1) + (vv[2] - vv[3])) >> 4;
		psum += p2 - vv[3];
		vnew[6] = (((psum + vv[7]) << 1) + (vv[3] - vv[4])) >> 4;
		psum += p2 - vv[4];
		vnew[7] = (((psum + vv[8]) << 1) + (vv[4] - vv[5])) >> 4;

		/* Store byte by byte.  Writing through int pointers cast from */
		/* &vv[1] is a misaligned, aliasing-violating access and only  */
		/* covers eight pixels when int happens to be four bytes wide. */
		for (i=0; i<8; i++) {
			vv[1+i] = vnew[i];
		}
	}
}


/* Horizontal deblocking filter used in default (non-DC) mode.             */
/*                                                                         */
/* For each of four consecutive rows, looks at pixels v[1]..v[8] around    */
/* the block edge between v[4] and v[5] and, when the step across the      */
/* edge is larger than the activity inside either block, moves v[4] and    */
/* v[5] towards each other.  Only v[4] and v[5] are ever written.          */
/*                                                                         */
/*   v      - points at pixel v0 of the first row                          */
/*   stride - distance in bytes between consecutive rows                   */
/*   QP     - quantiser; the edge is left alone when |a3_0| >= 8*QP        */
static inline void deblock_horiz_default_filter(uint8_t *v, int stride, int QP) {
	int y;

	for (y=0; y<4; y++, v += stride) {
		int q1 = v[4] - v[5];
		int q  = q1 / 2;
		int a3_0, a3_1, a3_2, d;

		/* the correction is clipped to the range 0 ... q, so q == 0 means no change */
		if (!q) continue;

		/* a3_0 = 2*v3 - 5*v4 + 5*v5 - 2*v6 (across the edge) */
		a3_0 = 2*(v[3]-v[6]) - 5*q1;

		/* apply the 'delta' function first and check there is a difference to avoid wasting time */
		if (ABS(a3_0) >= 8*QP) continue;

		/* a3_1 = 2*v1 - 5*v2 + 5*v3 - 2*v4 (inside the left block)  */
		a3_1 = 2*(v[1]-v[4]) + 5*(v[3]-v[2]);
		/* a3_2 = 2*v5 - 5*v6 + 5*v7 - 2*v8 (inside the right block); */
		/* same formula as the vertical default filter uses           */
		a3_2 = 2*(v[5]-v[8]) + 5*(v[7]-v[6]);

		d = ABS(a3_0) - MIN(ABS(a3_1), ABS(a3_2));

		/* only act when energy across the boundary is greater than in one or both of the blocks */
		if (d <= 0) continue;

		d = (5*d + 32) >> 6;
		if (d <= 0) continue;

		d *= SIGN(-a3_0);

		/* clip d in the range 0 ... q */
		if (q > 0) {
			d = d<0 ? 0 : d;
			d = d>q ? q : d;
		} else {
			d = d>0 ? 0 : d;
			d = d<q ? q : d;
		}

		v[4] -= d;
		v[5] += d;
	}
}


/* This is a horizontal deblocking filter - i.e. it smooths _vertical_     */
/* block edges.  It visits every block boundary x = 8, 16, ... < width in  */
/* the rows starting at 'image' and applies either the DC-mode low-pass    */
/* filter or the default filter there.  QP_store is indexed by x/8 for     */
/* chroma and x/16 for luma.  'height' and 'QP_stride' are not used.       */
void deblock_horiz(uint8_t *image, int width, int height, int stride, QP_STORE_T *QP_store, int QP_stride, int chroma) {
	const int qp_div = chroma ? 8 : 16;
	int edge;

	for (edge = 8; edge < width; edge += 8) {
		/* extract QP from the decoder's array of QP values */
		int QP = QP_store[edge / qp_div];

		/* pixel v0 of the filter window, five pixels left of the edge */
		uint8_t *v = image + edge - 5;

		if (!deblock_horiz_useDC(v, stride)) {
			/* default mode */
			deblock_horiz_default_filter(v, stride, QP);
			continue;
		}

		/* DC offset mode: filter only when the QP test allows it */
		if (deblock_horiz_DC_on(v, stride, QP)) {
			deblock_horiz_lpf9(v, stride, QP);
		}
	}
}


/***** V E R T I C A L   D E B L O C K I N G   F I L T E R *****/


/* Decide between DC mode and default mode for the vertical filter.        */
/* For an eight-pixel-wide column, counts the vertically adjacent pixel    */
/* pairs between rows 1..8 (relative to v) whose values differ by at most  */
/* one.  Returns non-zero (DC mode) when that count is greater than        */
/* DEBLOCK_VERT_USEDC_THR.                                                 */
static inline int deblock_vert_useDC(uint8_t *v, int stride) {
	int flat_pairs = 0;
	int row, col;

	/* row pairs (1,2), (2,3), ... (7,8) */
	for (row = 1; row <= 7; row++) {
		const uint8_t *upper = v + row * stride;
		const uint8_t *lower = upper + stride;

		for (col = 0; col < 8; col++) {
			int diff = upper[col] - lower[col];

			if (diff >= -1 && diff <= 1) {
				flat_pairs++;
			}
		}
	}

	return (flat_pairs > DEBLOCK_VERT_USEDC_THR);
}


/* Decide whether the DC filter should be turned on according to QP.       */
/* Returns 0 as soon as any of the eight columns has |row1 - row8| above   */
/* 2*QP, and 1 when no column does.                                        */
static inline int deblock_vert_DC_on(uint8_t *v, int stride, int QP) {
	const uint8_t *top    = v + stride;
	const uint8_t *bottom = v + 8 * stride;
	const int limit = 2 * QP;
	int col;

	for (col = 0; col < 8; col++) {
		if (ABS(top[col] - bottom[col]) > limit) {
			return 0;
		}
	}

	return 1;
}


/* Vertical 9-tap low-pass filter for use in "DC" regions of the picture.  */
/*                                                                         */
/* For each of eight adjacent columns, rows 1..8 (relative to v) are       */
/* replaced by a low-pass filtered version of themselves; rows 0 and 9     */
/* are read but never written.  v_local and p1p2 are not referenced by     */
/* this plain C implementation.                                            */
void deblock_vert_lpf9(uint64_t *v_local, uint64_t *p1p2, uint8_t *v, int stride, int QP) {
	int col, row;

	for (col = 0; col < 8; col++) { /* loop left->right */
		uint8_t *column = v + col;
		uint8_t out[9];   /* out[k] is the new value for row k, k = 1..8 */
		int s[10];        /* the ten input samples of this column */
		int above, below, acc;

		for (row = 0; row < 10; row++) {
			s[row] = column[row * stride];
		}

		/* padding values: the outer neighbour when it is within QP of */
		/* the end pixel, otherwise the end pixel itself               */
		above = (ABS(s[0] - s[1]) < QP) ? s[0] : s[1];
		below = (ABS(s[8] - s[9]) < QP) ? s[9] : s[8];

		/* acc is a running sum that slides down one row per output */
		acc = above + above + above + s[1] + s[2] + s[3] + s[4] + 4;
		out[1] = (((acc + s[1]) << 1) - (s[4] - s[5])) >> 4;
		acc += s[5] - above;
		out[2] = (((acc + s[2]) << 1) - (s[5] - s[6])) >> 4;
		acc += s[6] - above;
		out[3] = (((acc + s[3]) << 1) - (s[6] - s[7])) >> 4;
		acc += s[7] - above;
		out[4] = (((acc + s[4]) << 1) + above - s[1] - (s[7] - s[8])) >> 4;
		acc += s[8] - s[1];
		out[5] = (((acc + s[5]) << 1) + (s[1] - s[2]) - s[8] + below) >> 4;
		acc += below - s[2];
		out[6] = (((acc + s[6]) << 1) + (s[2] - s[3])) >> 4;
		acc += below - s[3];
		out[7] = (((acc + s[7]) << 1) + (s[3] - s[4])) >> 4;
		acc += below - s[4];
		out[8] = (((acc + s[8]) << 1) + (s[4] - s[5])) >> 4;

		for (row = 1; row <= 8; row++) {
			column[row * stride] = out[row];
		}
	}
}


/* Vertical deblocking filter for use in non-flat picture regions.         */
/*                                                                         */
/* For each of eight adjacent columns, examines rows 1..8 (relative to v)  */
/* around the block edge between rows 4 and 5 and moves those two pixels   */
/* towards each other.  Only rows 4 and 5 are written.                     */
static void deblock_vert_default_filter(uint8_t *v, int stride, int QP) {
	int col;

	for (col = 0; col < 8; col++) {
		uint8_t *p = v + col;
		int s1 = p[1 * stride];
		int s2 = p[2 * stride];
		int s3 = p[3 * stride];
		int s4 = p[4 * stride];
		int s5 = p[5 * stride];
		int s6 = p[6 * stride];
		int s7 = p[7 * stride];
		int s8 = p[8 * stride];

		/* activity across the edge, inside the upper block, inside the lower block */
		int a3_0 = 2*s3 - 5*s4 + 5*s5 - 2*s6;
		int a3_1 = 2*s1 - 5*s2 + 5*s3 - 2*s4;
		int a3_2 = 2*s5 - 5*s6 + 5*s7 - 2*s8;
		int half_step = (s4 - s5) / 2;
		int d;

		if (ABS(a3_0) >= 8*QP) {
			continue;
		}

		d = ABS(a3_0) - MIN(ABS(a3_1), ABS(a3_2));
		if (d < 0) {
			d = 0;
		}

		d = (5*d + 32) >> 6;
		d *= SIGN(-a3_0);

		/* clip d in the range 0 ... half_step */
		if (half_step > 0) {
			if (d < 0)         d = 0;
			if (d > half_step) d = half_step;
		} else {
			if (d > 0)         d = 0;
			if (d < half_step) d = half_step;
		}

		p[4 * stride] -= d;
		p[5 * stride] += d;
	}
}


/* This is a vertical deblocking filter - i.e. it smooths _horizontal_     */
/* block edges.  It walks the blocks left to right (Bx = 0, 8, ... <       */
/* width); for each one the filter window starts five rows above 'image'.  */
/* QP_store is indexed by Bx/8 for chroma and Bx/16 for luma.  'height'    */
/* and 'QP_stride' are not used.                                           */
void deblock_vert( uint8_t *image, int width, int height, int stride, QP_STORE_T *QP_store, int QP_stride, int chroma) {
	/* scratch buffers handed to deblock_vert_lpf9 */
	uint64_t v_local[20];
	uint64_t p1p2[4];
	const int qp_div = chroma ? 8 : 16;
	int col;

	for (col = 0; col < width; col += 8) {
		int QP = QP_store[col / qp_div];
		uint8_t *v = image + col - 5*stride;

		/* decide whether to use DC mode on a block-by-block basis */
		if (!deblock_vert_useDC(v, stride)) {
			/* use the default filter */
			deblock_vert_default_filter(v, stride, QP);
		} else if (deblock_vert_DC_on(v, stride, QP)) {
			/* DC mode, and the block is low-energy enough to filter */
			deblock_vert_lpf9(v_local, p1p2, v, stride, QP);
		}
	}
}


/* This is the deringing filter.                                            */
/*                                                                          */
/* Processes one row of 8x8 blocks whose top-left pixels are image[x] for   */
/* x = 8, 16, ... < width-8 (the outermost blocks are skipped).  For each   */
/* block it:                                                                */
/*   1. finds the min/max grey level and derives a threshold,               */
/*   2. builds per-row bit masks of the surrounding 10x10 window telling    */
/*      which pixels are at/above the threshold,                            */
/*   3. marks the pixels whose whole 3x3 neighbourhood lies on the same     */
/*      side of the threshold,                                              */
/*   4. smooths the marked pixels with a 3x3 kernel (1 2 1 / 2 4 2 / 1 2 1, */
/*      divided by 16 with rounding), and                                   */
/*   5. writes the result back, limiting the change to +/- QP/2.            */
/*                                                                          */
/* The 10x10 window starts one row above and one pixel left of the block,   */
/* so rows -1 .. 8 relative to 'image' are read.  QP_store is indexed by    */
/* x/8 for chroma and x/16 for luma.  'height' and 'QP_stride' are not      */
/* used.                                                                    */
void dering( uint8_t *image, int width, int height, int stride, QP_STORE_T *QP_store, int QP_stride, int chroma) {
	int x, h, v, i, j;
	uint8_t *b8x8, *b10x10;
	uint8_t b8x8filtered[64]; /* filtered pixels; only entries for marked pixels are written and read */
	int QP, max_diff;
	uint8_t min, max, thr, range;
	uint16_t indicesP[10];  /* bitwise array of binary indices above threshold */
	uint16_t indicesN[10];  /* bitwise array of binary indices below threshold */
	uint16_t indices3x3[8]; /* bitwise array of pixels where we should filter */
	uint16_t sr;            /* single-bit mask walked across a row of indices3x3 */
	
	/* loop over all the 8x8 blocks in the image... */
	/* don't process outer row of blocks for the time being. */
	/* (the loop over y is disabled: one row of blocks is handled per call) */
//	for (y=8; y<height-8; y+=8) {
		for (x=8; x< width-8; x+=8) {
		
			/* QP for this block.. */
			QP = chroma ? QP_store[x/8]
			            : QP_store[x/16];	
	
			/* pointer to the top left pixel in 8x8   block */
			b8x8   = &(image[x]);
			/* pointer to the top left pixel in 10x10 block */
			b10x10 = &(image[-stride + (x-1)]);
			
			/* Threshold determination - find min and max grey levels in the block */
			min = 255; max = 0;
			for (v=0; v<8; v++) {
				for (h=0; h<8; h++) {
					min = b8x8[stride*v + h] < min ? b8x8[stride*v + h] : min;				
					max = b8x8[stride*v + h] > max ? b8x8[stride*v + h] : max;				
				}
			} 
			/* Threshold determination - compute threshold and dynamic range */
			/* (range is computed here but not used anywhere below) */
			thr = (max + min + 1) / 2;
			range = max - min;
			
			/* Threshold rearrangement not implemented yet */
			
			/* Index acquisition: column i of the 10x10 window maps to bit i+1 */
			/* of the row mask (2 << i); indicesN is the bitwise complement,   */
			/* so it also has the bits outside the ten columns set             */
			for (j=0; j<10; j++) {
				indicesP[j] = 0;
				for (i=0; i<10; i++) {
					if (b10x10[j*stride+i] >= thr) indicesP[j] |= (2 << i);
				}
				indicesN[j] = ~indicesP[j];			
			}
			
			/* Adaptive filtering */
			/* need to identify 3x3 blocks of '1's in indicesP and indicesN */
			/* first horizontally: a bit survives only if both neighbours in its row are set */
			for (j=0; j<10; j++) {
				indicesP[j] = (indicesP[j]<<1) & indicesP[j] & (indicesP[j]>>1);				
				indicesN[j] = (indicesN[j]<<1) & indicesN[j] & (indicesN[j]>>1);				
			}			
			/* then vertically: combine each row with the rows above and below; */
			/* indices3x3[j-1] describes row j-1 of the 8x8 block               */
			for (j=1; j<9; j++) {
				indices3x3[j-1]  = indicesP[j-1] & indicesP[j] & indicesP[j+1];				
				indices3x3[j-1] |= indicesN[j-1] & indicesN[j] & indicesN[j+1];				
			}			

			/* smooth every marked pixel with the 3x3 kernel; the 8 is the rounding term */
			for (v=0; v<8; v++) {
				sr = 4; /* bit 2 = window column 1 = column 0 of the 8x8 block */
				for (h=0; h<8; h++) {
					if (indices3x3[v] & sr) {
						b8x8filtered[8*v + h] = ( 8
						 + 1 * b10x10[stride*(v+0) + (h+0)] + 2 * b10x10[stride*(v+0) + (h+1)] + 1 * b10x10[stride*(v+0) + (h+2)]
						 + 2 * b10x10[stride*(v+1) + (h+0)] + 4 * b10x10[stride*(v+1) + (h+1)] + 2 * b10x10[stride*(v+1) + (h+2)]
						 + 1 * b10x10[stride*(v+2) + (h+0)] + 2 * b10x10[stride*(v+2) + (h+1)] + 1 * b10x10[stride*(v+2) + (h+2)]
						) / 16;
					}
					sr <<= 1;
				}
			}
			
			/* Clipping: a marked pixel may move by at most max_diff from its original value */
			max_diff = QP/2;
			for (v=0; v<8; v++) {
				sr = 4;
				for (h=0; h<8; h++) {
					if (indices3x3[v] & sr) {
						if        (b8x8filtered[8*v + h] - b8x8[stride*v + h] >  max_diff) {
							b8x8[stride*v + h] = b8x8[stride*v + h] + max_diff;
						} else if (b8x8filtered[8*v + h] - b8x8[stride*v + h] < -max_diff) {
							b8x8[stride*v + h] = b8x8[stride*v + h] - max_diff;	
						} else {
							b8x8[stride*v + h] = b8x8filtered[8*v + h];
						}  								
					}
					sr <<= 1;
				}
			}

		}
//	}


}


/* This function is more or less what Andrea wanted:                       */
/* copy the three planes of a 4:2:0 picture from src to dst (unless        */
/* PP_DONT_COPY is set in 'mode') and then run the filters selected by the */
/* PP_* bits in 'mode' on the destination planes.                          */
/*                                                                         */
/*   src, dst        - arrays of three plane pointers: Y, U, V             */
/*   src_stride,                                                           */
/*   dst_stride      - luma row strides; chroma planes use half of these   */
/*   horizontal_size,                                                      */
/*   vertical_size   - luma picture size; chroma planes are half in both   */
/*   QP_store,                                                             */
/*   QP_stride       - quantiser array handed on to the filters            */
/*                                                                         */
/* NOTE(review): each filter is called once per plane with the plane       */
/* origin.  deblock_horiz, deblock_vert and dering as written in this file */
/* have no loop over rows and ignore their 'height' argument, and          */
/* deblock_vert / dering read rows above the pointer they are given, so    */
/* the calls below do not cover the whole picture and read before the      */
/* start of the plane.  Confirm whether this entry point is still used;    */
/* postprocess() drives the same filters stripe by stripe.                 */
void postprocess_orig(unsigned char * src[], int src_stride,
                 unsigned char * dst[], int dst_stride, 
                 int horizontal_size,   int vertical_size, 
                 QP_STORE_T *QP_store,  int QP_stride,
					  int mode) {
					  
	uint8_t *Y, *U, *V;
	/* geometry of the 4:2:0 chroma planes */
	int chroma_width      = horizontal_size / 2;
	int chroma_height     = vertical_size / 2;
	int chroma_src_stride = src_stride / 2;
	int chroma_dst_stride = dst_stride / 2;
	int x, y;

	if (!(mode & PP_DONT_COPY)) {
		/* First copy source to destination... */
		/* luma */
		for (y=0; y<vertical_size; y++) {
			for (x=0; x<horizontal_size; x++) {
				(dst[0])[y*dst_stride + x] = (src[0])[y*src_stride + x];
			}
		}
		/* chroma */
		for (y=0; y<chroma_height; y++) {
			for (x=0; x<chroma_width; x++) {
				(dst[1])[y*chroma_dst_stride + x] = (src[1])[y*chroma_src_stride + x];
				(dst[2])[y*chroma_dst_stride + x] = (src[2])[y*chroma_src_stride + x];
			}
		}
	}
					  
	Y = dst[0];
	U = dst[1];
	V = dst[2];
	
	/* The chroma filters must step through the planes with the chroma  */
	/* stride, i.e. the same stride the copy above uses, not the luma   */
	/* stride.                                                          */
	if (mode & PP_DEBLOCK_Y_H) {
		deblock_horiz(Y, horizontal_size, vertical_size, dst_stride,        QP_store, QP_stride, 0);
	}
	if (mode & PP_DEBLOCK_Y_V) {
		deblock_vert( Y, horizontal_size, vertical_size, dst_stride,        QP_store, QP_stride, 0);
	}
	if (mode & PP_DEBLOCK_C_H) {
		deblock_horiz(U, chroma_width,    chroma_height, chroma_dst_stride, QP_store, QP_stride, 1);
		deblock_horiz(V, chroma_width,    chroma_height, chroma_dst_stride, QP_store, QP_stride, 1);
	}
	if (mode & PP_DEBLOCK_C_V) {
		deblock_vert( U, chroma_width,    chroma_height, chroma_dst_stride, QP_store, QP_stride, 1);
		deblock_vert( V, chroma_width,    chroma_height, chroma_dst_stride, QP_store, QP_stride, 1);
	}
	if (mode & PP_DERING_Y) {
		dering(       Y, horizontal_size, vertical_size, dst_stride,        QP_store, QP_stride, 0);
	}
	if (mode & PP_DERING_C) {
		dering(       U, chroma_width,    chroma_height, chroma_dst_stride, QP_store, QP_stride, 1);
		dering(       V, chroma_width,    chroma_height, chroma_dst_stride, QP_store, QP_stride, 1);
	}

}

/* Copy a rectangle of bytes from src to dst.                              */
/*                                                                         */
/*   src, src_stride  - first source row and distance between source rows  */
/*   dst, dst_stride  - first destination row and distance between rows    */
/*   horizontal_size  - bytes to copy per row                              */
/*   vertical_size    - number of rows to copy                             */
/*                                                                         */
/* Every row is copied in full, including widths that are not a multiple   */
/* of eight.  Nothing is copied when either size is zero or negative.      */
/* The copy goes through memmove, so it places no alignment requirement    */
/* on the pointers or strides and tolerates src == dst.  The function has  */
/* ordinary external linkage (no bare 'inline'), so a definition is        */
/* always emitted for the calls in this file.                              */
void fast_copy(unsigned char *src, int src_stride,
                 unsigned char *dst, int dst_stride, 
                 int horizontal_size,   int vertical_size) {
	int row;

	if (horizontal_size <= 0) {
		return;
	}

	for (row = 0; row < vertical_size; row++) {
		memmove(dst, src, (size_t)horizontal_size);
		src += src_stride;
		dst += dst_stride;
	}
}

/* Post-process a 4:2:0 picture, working down each plane in stripes of     */
/* four rows.  For every stripe the source rows are copied to the          */
/* destination (unless PP_DONT_COPY is set in 'mode') and the filters      */
/* selected by the PP_* bits in 'mode' are run on the destination.         */
/*                                                                         */
/*   src, dst        - arrays of three plane pointers: Y, U, V             */
/*   src_stride,                                                           */
/*   dst_stride      - luma row strides; chroma planes use half of these   */
/*   horizontal_size,                                                      */
/*   vertical_size   - luma picture size; chroma planes are half in both   */
/*   QP_store        - quantiser array; a row of it is selected with       */
/*                     (luma row >> 4) or (chroma row >> 3)                */
/*   QP_stride       - distance between rows of QP_store                   */
/*                                                                         */
/* The vertical deblocking filter runs one stripe behind the copy, on the  */
/* block edge four rows above the current stripe, because it needs rows    */
/* below the edge to be in the destination already.                        */
void postprocess(unsigned char * src[], int src_stride,
                 unsigned char * dst[], int dst_stride, 
                 int horizontal_size,   int vertical_size, 
                 QP_STORE_T *QP_store,  int QP_stride,
					  int mode) {
					  
	uint8_t *puc_src;
	uint8_t *puc_dst;
	uint8_t *puc_flt;
	QP_STORE_T *QP_ptr;
	int y, i;


	/* this loop is (hopefully) going to improve performance */
	/* loop down the picture, copying and processing in vertical stripes, each four pixels high */
	for (y=0; y<vertical_size; y+= 4) {
		
		if (!(mode & PP_DONT_COPY)) {
			puc_src = &((src[0])[y*src_stride]);
			puc_dst = &((dst[0])[y*dst_stride]);

			/* First copy source to destination... */
			fast_copy(puc_src, src_stride, puc_dst, dst_stride, horizontal_size, 4);
		}
		
		if (mode & PP_DEBLOCK_Y_H) {
			puc_flt = &((dst[0])[y*dst_stride]);  
			QP_ptr  = &(QP_store[(y>>4)*QP_stride]);
			deblock_horiz(puc_flt, horizontal_size, 4,     dst_stride, QP_ptr, QP_stride, 0);
		}

		if (mode & PP_DEBLOCK_Y_V) { 
			/* only on the second stripe of a block row, and never on the top picture edge */
			if ( ((y&7)) && ((y-4)>=8) )   {
				puc_flt = &((dst[0])[(y-4)*dst_stride]);  
				QP_ptr  = &(QP_store[((y-4)>>4)*QP_stride]);
				deblock_vert( puc_flt, horizontal_size, 4,  dst_stride, QP_ptr, QP_stride, 0);
			}
		}

		if (mode & PP_DERING_Y) {
			if ( (y%8) && (y-4)>5 )   {
				puc_flt = &((dst[0])[y*dst_stride]);  
				QP_ptr  = &(QP_store[(y>>4)*QP_stride]);
				dering( puc_flt, horizontal_size, 4,  dst_stride, QP_ptr, QP_stride, 0);
			}
		}

	} /* for loop */

	/* now we're going to do U and V assuming 4:2:0 */
	horizontal_size >>= 1;
	vertical_size   >>= 1;
	src_stride      >>= 1;
	dst_stride      >>= 1;


	/* loop U then V */
	for (i=1; i<=2; i++) {

	for (y=0; y<vertical_size; y+= 4) {
		
		if (!(mode & PP_DONT_COPY)) {
			puc_src = &((src[i])[y*src_stride]);
			puc_dst = &((dst[i])[y*dst_stride]);

			/* First copy source to destination... */
			fast_copy(puc_src, src_stride, puc_dst, dst_stride, horizontal_size, 4);
		}
		
		/* In the chroma planes the QP row is selected with y>>3, the   */
		/* same mapping for all three chroma filters below.             */
		if (mode & PP_DEBLOCK_C_H) {
			puc_flt = &((dst[i])[y*dst_stride]);  
			QP_ptr  = &(QP_store[(y>>3)*QP_stride]);
			deblock_horiz(puc_flt, horizontal_size,  4,    dst_stride, QP_ptr, QP_stride, 1);
		}

		if (mode & PP_DEBLOCK_C_V) { 
			if ( ((y&7)) && ((y-4)>=8) ) {
				/* filter the chroma plane being processed (dst[i]), not the luma plane */
				puc_flt = &((dst[i])[(y-4)*dst_stride]);  
				QP_ptr  = &(QP_store[((y-4)>>3)*QP_stride]);
				deblock_vert( puc_flt, horizontal_size, 4,  dst_stride, QP_ptr, QP_stride, 1);
			}
		}

		if (mode & PP_DERING_C) {
			if ( (y%8) && (y-4)>5 )   {
				puc_flt = &((dst[i])[y*dst_stride]);  
				QP_ptr  = &(QP_store[(y>>3)*QP_stride]);
				dering( puc_flt, horizontal_size, 4,  dst_stride, QP_ptr, QP_stride, 1);
			}
		}

	} /* stripe loop */

	} /* U,V loop */


}