pngvcrd.c revision f9a51917495bc8ba8b60632219652a7b122c1190
*
* For Intel x86 CPU and Microsoft Visual C++ compiler
*
* libpng version 1.2.8 - December 3, 2004
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2004 Glenn Randers-Pehrson
* Copyright (c) 1998, Intel Corporation
*
* Contributed by Nirav Chhatrapati, Intel Corporation, 1998
* Interface to libpng contributed by Gilles Vollant, 1999
*
*
* In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
* a sign error in the post-MMX cleanup code for each pixel_depth resulted
* in bad pixels at the beginning of some rows of some images, and also
* (due to out-of-range memory reads and writes) caused heap corruption
* when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
*
* [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
*
* [runtime MMX configuration, GRR 20010102]
*
*/
#define PNG_INTERNAL
#include "png.h"
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
/* File-scope MMX state consulted by the routines below.  It starts at 2, and
 * several routines in this file test (mmx_supported == 2) before using it.
 * NOTE(review): 2 looks like a "not probed yet" sentinel, distinct from the
 * 0/1 result of png_mmx_support() -- confirm against the full source. */
static int mmx_supported=2;
/* Reports whether the CPU supports MMX.
 *
 * Returns the value of mmx_supported_local, which is initialised to 0; the
 * debug printf below documents 1 as meaning "MMX supported".
 *
 * NOTE(review): in the code visible here the inline assembly only saves and
 * restores EFLAGS twice and never writes mmx_supported_local, so as written
 * the function returns 0.  The probe that the orphaned comments inside the
 * _asm block refer to (the NOT_SUPPORTED label, "mov eax, 1") is not present
 * in this view -- confirm against the full source.
 */
int PNGAPI
png_mmx_support(void)
{
int mmx_supported_local = 0;
_asm {
pushfd //Save Eflag to stack
popfd //Restore modified value back to Eflag reg
pushfd //Save Eflag to stack
popfd // restore original Eflag
//skip following instructions and jump to
//NOT_SUPPORTED label
//faster than the instruction "mov eax, 1"
}
//mmx_supported_local=0; // test code: force "MMX not supported"
//printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
return mmx_supported_local;
}
/* Combines the row recently read in with the previous row.
This routine takes care of alpha and transparency if requested.
This routine also handles the two methods of progressive display
of interlaced images, depending on the mask value.
The mask value describes which pixels are to be combined with
the row. The pattern always repeats every 8 pixels, so just 8
bits are needed. A one indicates the pixel is to be combined; a
zero indicates the pixel is to be skipped. This is in addition
to any alpha or transparency value associated with the pixel. If
you want all pixels to be combined, pass 0xff (255) in mask. */
/* Use this routine for x86 platform - uses faster MMX routine if machine
supports MMX */
/* png_combine_row() -- name taken from the closing comment of this block.
 * NOTE(review): the line carrying the function name and parameter list, and
 * most of the loop bodies, are not visible in this view.  The comments added
 * below describe only the statements that are present. */
void /* PRIVATE */
{
#ifdef PNG_USE_LOCAL_ARRAYS
#endif
/* mmx_supported is still at its initial value of 2 */
if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
/* this should have happened in png_init_mmx_flags() already */
#endif
}
/* mask == 0xff: every pixel is to be combined */
if (mask == 0xff)
{
}
/* GRR: add "else if (mask == 0)" case?
* or does png_combine_row() not even get called in that case? */
else
{
/* one case per pixel depth; see the "end switch" comment at the bottom */
{
case 1:
{
int m;
int shift;
png_uint_32 i;
/* m is a one-bit cursor into mask, starting at the top bit */
m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
/* NOTE(review): the condition selecting this branch is not visible here */
{
s_start = 0;
s_end = 7;
s_inc = 1;
}
else
#endif
{
/* default: start at 7, end at 0, step -1 (presumably bit shifts within a
 * byte; the loop that consumes them is not visible here) */
s_start = 7;
s_end = 0;
s_inc = -1;
}
{
/* mask bit under the cursor is set: this pixel is to be combined */
if (m & mask)
{
int value;
}
{
sp++;
dp++;
}
else
/* step the cursor: 0x01 wraps back to 0x80, otherwise shift right one bit */
if (m == 1)
m = 0x80;
else
m >>= 1;
}
break;
}
case 2:
{
int m;
int shift;
png_uint_32 i;
int value;
/* one-bit cursor into mask, starting at the top bit */
m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
{
s_start = 0;
s_end = 6;
s_inc = 2;
}
else
#endif
{
/* default: start at 6, end at 0, step -2 */
s_start = 6;
s_end = 0;
s_inc = -2;
}
{
/* mask bit set: this pixel is to be combined */
if (m & mask)
{
}
{
sp++;
dp++;
}
else
/* step the cursor: 0x01 wraps back to 0x80, otherwise shift right one bit */
if (m == 1)
m = 0x80;
else
m >>= 1;
}
break;
}
case 4:
{
int m;
int shift;
png_uint_32 i;
int value;
/* one-bit cursor into mask, starting at the top bit */
m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
{
s_start = 0;
s_end = 4;
s_inc = 4;
}
else
#endif
{
/* default: start at 4, end at 0, step -4 */
s_start = 4;
s_end = 0;
s_inc = -4;
}
{
/* mask bit set: this pixel is to be combined */
if (m & mask)
{
}
{
sp++;
dp++;
}
else
/* step the cursor: 0x01 wraps back to 0x80, otherwise shift right one bit */
if (m == 1)
m = 0x80;
else
m >>= 1;
}
break;
}
case 8:
{
int m;
/* NOTE(review): for !PNG_1_0_X only the tail of the condition is visible */
#if !defined(PNG_1_0_X)
/* && mmx_supported */ )
#else
if (mmx_supported)
#endif
{
m = 0x80;
{
end8:
}
}
else /* mmx not supported - use modified C routine */
{
png_uint_32 i;
{
}
} /* end of else */
break;
} // end 8 bpp
case 16:
{
/* NOTE(review): tail of a declaration whose beginning is not visible here;
 * presumably a 64-bit mask constant for the MMX path -- confirm */
mask0=0x1010202040408080;
#if !defined(PNG_1_0_X)
/* && mmx_supported */ )
#else
if (mmx_supported)
#endif
{
{
}
}
else /* mmx not supported - use modified C routine */
{
png_uint_32 i;
{
}
} /* end of else */
break;
} // end 16 bpp
case 24:
{
/* NOTE(review): tail of a declaration whose beginning is not visible here */
mask1=0x0408080810101020,
mask0=0x2020404040808080;
#if !defined(PNG_1_0_X)
/* && mmx_supported */ )
#else
if (mmx_supported)
#endif
{
{
}
}
else /* mmx not supported - use modified C routine */
{
png_uint_32 i;
{
}
} /* end of else */
break;
} // end 24 bpp
case 32:
{
/* NOTE(review): tail of a declaration whose beginning is not visible here */
mask2=0x0404040408080808,
mask1=0x1010101020202020,
mask0=0x4040404080808080;
#if !defined(PNG_1_0_X)
/* && mmx_supported */ )
#else
if (mmx_supported)
#endif
{
{
}
}
else /* mmx not supported - use modified C routine */
{
png_uint_32 i;
{
}
} /* end of else */
break;
} // end 32 bpp
case 48:
{
/* NOTE(review): tail of a declaration whose beginning is not visible here */
mask4=0x0202020204040404,
mask3=0x0404080808080808,
mask2=0x1010101010102020,
mask1=0x2020202040404040,
mask0=0x4040808080808080;
#if !defined(PNG_1_0_X)
/* && mmx_supported */ )
#else
if (mmx_supported)
#endif
{
{
}
}
else /* mmx not supported - use modified C routine */
{
png_uint_32 i;
{
}
} /* end of else */
break;
} // end 48 bpp
/* every pixel depth without a dedicated case above */
default:
{
unsigned int i;
{
}
break;
}
} /* end switch (png_ptr->row_info.pixel_depth) */
} /* end if (non-trivial mask) */
} /* end png_combine_row() */
#if defined(PNG_READ_INTERLACING_SUPPORTED)
/* Interlace expansion routine, compiled only when
 * PNG_READ_INTERLACING_SUPPORTED is defined -- presumably
 * png_do_read_interlace(), the name used in the note at the top of this file.
 * NOTE(review): the line carrying the function name and parameter list, the
 * MMX assembler and most of the copy statements are not visible in this view.
 * What remains shows the overall shape: a switch on row_info->pixel_depth,
 * and in the default case loops that count i down from width while stepping
 * the sptr and dp pointers backwards, with an inner loop running
 * png_pass_inc[pass] times per outer iteration. */
void /* PRIVATE */
{
#ifdef PNG_USE_LOCAL_ARRAYS
#endif
/* mmx_supported is still at its initial value of 2 */
if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
/* this should have happened in png_init_mmx_flags() already */
#endif
}
{
switch (row_info->pixel_depth)
{
case 1:
{
png_byte v;
png_uint_32 i;
int j;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
/* PNG_PACKSWAP requested: start at 7, end at 0, step -1 */
if (transformations & PNG_PACKSWAP)
{
s_start = 7;
s_end = 0;
s_inc = -1;
}
else
#endif
{
/* default: start at 0, end at 7, step +1 (presumably bit shifts within a
 * byte; the statements that consume them are not visible here) */
s_start = 0;
s_end = 7;
s_inc = 1;
}
{
/* inner loop runs png_pass_inc[pass] times */
for (j = 0; j < png_pass_inc[pass]; j++)
{
{
dp--;
}
else
}
{
sp--;
}
else
}
break;
}
case 2:
{
png_uint_32 i;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
/* PNG_PACKSWAP requested: start at 6, end at 0, step -2 */
if (transformations & PNG_PACKSWAP)
{
s_start = 6;
s_end = 0;
s_inc = -2;
}
else
#endif
{
/* default: start at 0, end at 6, step +2 */
s_start = 0;
s_end = 6;
s_inc = 2;
}
{
png_byte v;
int j;
/* inner loop runs png_pass_inc[pass] times */
for (j = 0; j < png_pass_inc[pass]; j++)
{
{
dp--;
}
else
}
{
sp--;
}
else
}
break;
}
case 4:
{
png_uint_32 i;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
/* PNG_PACKSWAP requested: start at 4, end at 0, step -4 */
if (transformations & PNG_PACKSWAP)
{
s_start = 4;
s_end = 0;
s_inc = -4;
}
else
#endif
{
/* default: start at 0, end at 4, step +4 */
s_start = 0;
s_end = 4;
s_inc = 4;
}
{
png_byte v;
int j;
/* inner loop runs png_pass_inc[pass] times */
for (j = 0; j < png_pass_inc[pass]; j++)
{
{
dp--;
}
else
}
{
sp--;
}
else
}
break;
}
default: // This is the place where the routine is modified
{
// __int64 const5 = 0x000000FFFFFF0000; // unused...
png_uint_32 i;
// New code by Nirav Chhatrapati - Intel Corporation
// sign fix by GRR
// NOTE: there is NO MMX code for 48-bit and 64-bit images
// use MMX routine if machine supports it
/* NOTE(review): for !PNG_1_0_X only the tail of the condition is visible */
#if !defined(PNG_1_0_X)
/* && mmx_supported */ )
#else
if (mmx_supported)
#endif
{
/* MMX path: one branch per pixel_bytes value.  Each visible sub-block has an
 * "if (width_mmx)" section (assembler not visible here) followed by a C loop
 * over i. */
if (pixel_bytes == 3)
{
{
{
//sub esi, 3
}
}
{
{
}
}
else if (width) /* && ((pass == 4) || (pass == 5)) */
{
/* clamp a negative width_mmx to zero */
if (width_mmx < 0)
width_mmx = 0;
if (width_mmx)
{
{
}
}
/* i counts down from width; dp steps back 3 bytes per inner iteration and
 * sptr steps back 3 bytes per outer iteration */
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 3;
}
sptr -= 3;
}
}
} /* end of pixel_bytes == 3 */
else if (pixel_bytes == 1)
{
{
if (width_mmx)
{
{
}
}
for (i = width; i; i--)
{
int j;
/* I simplified this part in version 1.0.4e
* here and in several other instances where
* pixel_bytes == 1 -- GR-P
*
* Original code:
*
* png_byte v[8];
* png_memcpy(v, sptr, pixel_bytes);
* for (j = 0; j < png_pass_inc[pass]; j++)
* {
* png_memcpy(dp, v, pixel_bytes);
* dp -= pixel_bytes;
* }
* sptr -= pixel_bytes;
*
* Replacement code is in the next three lines:
*/
/* NOTE(review): only two lines follow here, so sptr-- is the body of the j
 * loop as shown -- confirm that a line is not missing */
for (j = 0; j < png_pass_inc[pass]; j++)
sptr--;
}
}
{
if (width_mmx)
{
{
}
}
for (i = width; i; i--)
{
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
}
sptr --;
}
}
else if (width) /* && ((pass == 4) || (pass == 5))) */
{
if (width_mmx)
{
{
//movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
//sub esi, 4
}
}
for (i = width; i; i--)
{
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
}
sptr --;
}
}
} /* end of pixel_bytes == 1 */
else if (pixel_bytes == 2)
{
{
if (width_mmx)
{
{
}
}
/* here sptr is stepped back 2 bytes before the inner loop, and dp steps back
 * 2 bytes per inner iteration */
for (i = width; i; i--)
{
png_byte v[8];
int j;
sptr -= 2;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 2;
}
}
}
{
if (width_mmx)
{
{
//sub esi, 4
}
}
for (i = width; i; i--)
{
png_byte v[8];
int j;
sptr -= 2;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 2;
}
}
}
else if (width) // pass == 4 or 5
{
if (width_mmx)
{
{
}
}
for (i = width; i; i--)
{
png_byte v[8];
int j;
sptr -= 2;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 2;
}
}
}
} /* end of pixel_bytes == 2 */
else if (pixel_bytes == 4)
{
{
if (width_mmx)
{
{
}
}
/* sptr is stepped back 4 bytes before the inner loop, and dp steps back
 * 4 bytes per inner iteration */
for (i = width; i; i--)
{
png_byte v[8];
int j;
sptr -= 4;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 4;
}
}
}
{
if (width_mmx)
{
{
}
}
for (i = width; i; i--)
{
png_byte v[8];
int j;
sptr -= 4;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 4;
}
}
}
else if (width) // pass == 4 or 5
{
if (width_mmx)
{
{
}
}
for (i = width; i; i--)
{
png_byte v[8];
int j;
sptr -= 4;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 4;
}
}
}
} /* end of pixel_bytes == 4 */
else if (pixel_bytes == 6)
{
/* no "if (width_mmx)" section in this branch: C loop only */
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= 6;
}
sptr -= 6;
}
} /* end of pixel_bytes == 6 */
else
{
/* any other pixel_bytes value: same loop shape with a variable step */
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= pixel_bytes;
}
sptr-= pixel_bytes;
}
}
} /* end of mmx_supported */
else /* MMX not supported: use modified C code - takes advantage
* of inlining of memcpy for a constant */
{
if (pixel_bytes == 1)
{
for (i = width; i; i--)
{
int j;
/* NOTE(review): as shown, sptr-- is the body of the j loop -- confirm that a
 * line is not missing */
for (j = 0; j < png_pass_inc[pass]; j++)
sptr--;
}
}
else if (pixel_bytes == 3)
{
/* the remaining branches share one shape: dp steps back pixel_bytes per
 * inner iteration, sptr steps back pixel_bytes per outer iteration */
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else if (pixel_bytes == 2)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else if (pixel_bytes == 4)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else if (pixel_bytes == 6)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
{
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
} /* end of MMX not supported */
break;
}
} /* end switch (row_info->pixel_depth) */
}
}
#endif /* PNG_READ_INTERLACING_SUPPORTED */
// These variables are utilized in the functions below. They are declared
// globally here to ensure alignment on 8-byte boundaries.
/* File-scope mask objects; the `double` member is what gives the union its
 * alignment (see the note above about 8-byte boundaries).
 * NOTE(review): only the `align` member is visible here, and the declaration
 * ends on a comma, so further declarators follow outside this view.  The
 * initialisers look like 64-bit byte masks, which suggests the union's first
 * member is a 64-bit integer that is not shown -- confirm. */
union uAll {
double align;
} LBCarryMask = {0x0101010101010101},
HBClearMask = {0x7f7f7f7f7f7f7f7f},
// Optimized code for PNG Average filter decoder
/* MMX Average-filter decoder -- presumably png_read_filter_row_mmx_avg(), the
 * name used in the change note at the top of this file.
 * NOTE(review): the name/parameter line and every assembler instruction except
 * the final emms are missing from this view; only the explanatory comments
 * remain.  Structure that is visible: a prologue _asm block, a switch on bpp
 * with one specialised block per value, and a clean-up _asm block ending in
 * emms.  The bpp == 1 case returns directly and so skips that clean-up. */
void /* PRIVATE */
{
int bpp;
//png_uint_32 len;
int diff;
_asm {
// Init address pointers and offset
// Compute the Raw value for the first bpp bytes
// Raw(x) = Avg(x) + (Prior(x)/2)
// mov does not affect flags; -1 to offset inc ebx
// get # of bytes to alignment
// fix alignment
// Compute the Raw value for the bytes up to the alignment boundary
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
// mov does not affect flags; -1 to offset inc ebx
} // end _asm block
// Now do the math for the rest of the row
switch ( bpp )
{
case 3:
{
_asm {
// Re-init address pointers and offset
// PRIME the pump (load the first Raw(x-bpp) data set
// (we correct position in loop below)
// Add (Prev_row/2) to Average
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
// lsb's were == 1 (Only valid for active group)
// byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
// lsb's were == 1 (Only valid for active group)
// byte
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
// bytes
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
// lsb's were == 1 (Only valid for active group)
// byte
// Now ready to write back to memory
// Move updated Raw(x) to use as Raw(x-bpp) for next loop
} // end _asm block
}
break;
// bpp 4 through 7 share one block
case 6:
case 4:
case 7:
case 5:
{
// appropriate inactive bytes
_asm {
// Re-init address pointers and offset
// Load ActiveMask and clear all bytes except for 1st active group
// PRIME the pump (load the first Raw(x-bpp) data set
// (we correct position in loop below)
// Add (Prev_row/2) to Average
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
// lsb's were == 1 (Only valid for active group)
// byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
// lsb's were == 1 (Only valid for active group)
// byte
// Now ready to write back to memory
// Prep Raw(x-bpp) for next loop
} // end _asm block
}
break;
case 2:
{
_asm {
// Load ActiveMask
// Re-init address pointers and offset
// PRIME the pump (load the first Raw(x-bpp) data set
// (we correct position in loop below)
// Add (Prev_row/2) to Average
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
// lsb's were == 1 (Only valid for active group)
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
// lsb's were == 1 (Only valid for active group)
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
// lsb's were == 1 (Only valid for active group)
// Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
// lsb's were == 1 (Only valid for active group)
// Now ready to write back to memory
// Prep Raw(x-bpp) for next loop
} // end _asm block
}
break;
case 1: // bpp == 1
{
_asm {
// Re-init address pointers and offset
// Do Avg decode for remaining bytes
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
// mov does not affect flags; -1 to offset inc ebx
} // end _asm block
}
// returns here, so the clean-up block and emms below are not executed
return;
case 8: // bpp == 8
{
_asm {
// Re-init address pointers and offset
// PRIME the pump (load the first Raw(x-bpp) data set
// (NO NEED to correct position in loop below)
// lsb's were == 1
} // end _asm block
}
break;
default: // bpp greater than 8
{
_asm {
// Re-init address pointers and offset
// lsb's were == 1
} // end _asm block
}
break;
} // end switch ( bpp )
_asm {
// MMX acceleration complete now do clean-up
// Check if any remaining bytes left to decode
// Do Avg decode for remaining bytes
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
// mov does not affect flags; -1 to offset inc ebx
emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
// Optimized code for PNG Paeth filter decoder
/* MMX Paeth-filter decoder.
 * NOTE(review): the name/parameter line (presumably
 * png_read_filter_row_mmx_paeth -- confirm), some of the `_asm` keywords and
 * every assembler instruction except the final emms are missing from this
 * view; only the explanatory comments remain.  Structure that is visible: a
 * prologue block, a switch on bpp, and a clean-up block ending in emms.  The
 * bpp 1, 2 and default cases return directly and so skip that clean-up.
 *
 * Notation used by the comments below: p = a + b - c, where (from the
 * "Raw(x) = Paeth(x) + ..." lines) a is Raw(x-bpp), b is Prior(x) and c is
 * Prior(x-bpp); pa, pb and pc are the absolute distances of p from a, b, c. */
void /* PRIVATE */
{
//png_uint_32 len;
int bpp;
int diff;
//int ptemp;
{
// Compute the Raw value for the first bpp bytes
// Note: the formula works out to be always
// Paeth(x) = Raw(x) + Prior(x) where x < bpp
// get # of bytes to alignment
// fix alignment
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pc = abs(pcv)
// pb = abs(pbv)
// pa = abs(pav)
// test if pa <= pb
// pa > pb; now test if pb <= pc
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
// pa <= pb; now test if pa <= pc
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
} // end _asm block
// Now do the math for the rest of the row
switch ( bpp )
{
case 3:
{
{
// PRIME the pump (load the first Raw(x-bpp) data set
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Now do Paeth for 2nd set of bytes (3-5)
// pbv = p - b = (a + b - c) - b = a - c
// pav = p - a = (a + b - c) - a = b - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
// pav + pbv = pbv + pav
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// pav = p - a = (a + b - c) - a = b - c
// Now mm1 will be used as Raw(x-bpp)
// Now do Paeth for 3rd, and final, set of bytes (6-7)
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm0 mask copy to merge a & b
// use mm7 mask to merge pa & pb
// test ((pa <= pb)? pa:pb) <= pc
// Step ebx to next set of 8 bytes and repeat loop til done
// mm1 will be used as Raw(x-bpp) next loop
// mm3 ready to be used as Prior(x-bpp) next loop
} // end _asm block
}
break;
// bpp 5, 6 and 7 share one block
case 6:
case 7:
case 5:
{
{
// PRIME the pump (load the first Raw(x-bpp) data set
// Must shift to position Raw(x-bpp) data
// Do first set of 4 bytes
// Must shift to position Prior(x-bpp) data
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Do second set of 4 bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Step ebx to next set of 8 bytes and repeat loop til done
// mm1 will be used as Raw(x-bpp) next loop
} // end _asm block
}
break;
case 4:
{
_asm {
// PRIME the pump (load the first Raw(x-bpp) data set
// a=Raw(x-bpp) bytes
// Do first set of 4 bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Do second set of 4 bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Step ebx to next set of 8 bytes and repeat loop til done
// mm1 will be used as Raw(x-bpp) next loop
} // end _asm block
}
break;
case 8: // bpp == 8
{
_asm {
// PRIME the pump (load the first Raw(x-bpp) data set
// a=Raw(x-bpp) bytes
// Do first set of 4 bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Do second set of 4 bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
// test pa <= pb
// use mm7 mask to merge pa & pb
// use mm0 mask copy to merge a & b
// test ((pa <= pb)? pa:pb) <= pc
// Step ebx to next set of 8 bytes and repeat loop til done
// mm1 will be used as Raw(x-bpp) next loop
} // end _asm block
}
break;
// bpp 1, 2 and anything above 8 share one block that decodes the whole
// remainder and then returns
case 1: // bpp = 1
case 2: // bpp = 2
default: // bpp > 8
{
_asm {
// Do Paeth decode for remaining bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pc = abs(pcv)
// pb = abs(pbv)
// pa = abs(pav)
// test if pa <= pb
// pa > pb; now test if pb <= pc
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
// pa <= pb; now test if pa <= pc
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
} // end _asm block
}
return; // No need to go further with this one
} // end switch ( bpp )
{
// MMX acceleration complete now do clean-up
// Check if any remaining bytes left to decode
// Do Paeth decode for remaining bytes
// pav = p - a = (a + b - c) - a = b - c
// pbv = p - b = (a + b - c) - b = a - c
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
// pc = abs(pcv)
// pb = abs(pbv)
// pa = abs(pav)
// test if pa <= pb
// pa > pb; now test if pb <= pc
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
// pa <= pb; now test if pa <= pc
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
// Optimized code for PNG Sub filter decoder
/* MMX Sub-filter decoder.
 * NOTE(review): the name/parameter line (presumably
 * png_read_filter_row_mmx_sub -- confirm) and every assembler instruction
 * except the final emms are missing from this view; only the explanatory
 * comments remain.  Structure that is visible: an alignment prologue, a
 * switch on bpp, and a final _asm block containing emms.  The bpp == 1 case
 * returns directly and so skips that final block. */
void /* PRIVATE */
{
//int test;
int bpp;
int diff;
_asm {
// get # of bytes to alignment
// alignment boundary
// ebx at alignment
// fix alignment
} // end _asm block
// Now do the math for the rest of the row
switch ( bpp )
{
case 3:
{
_asm {
// byte group
// PRIME the pump (load the first Raw(x-bpp) data set
// no need for mask; shift clears inactive bytes
// Add 1st active group
// Add 2nd active group
// Add 3rd active group
// Prep for doing 1st add at top of loop
} // end _asm block
}
break;
case 1:
{
// Placed here just in case this is a duplicate of the
// non-MMX code for the SUB filter in png_read_filter_row below
//
// png_bytep rp;
// png_bytep lp;
// png_uint_32 i;
// bpp = (row_info->pixel_depth + 7) >> 3;
// for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
// i < row_info->rowbytes; i++, rp++, lp++)
// {
// *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
// }
_asm {
} // end _asm block
}
// returns here, so the final emms block below is not executed
return;
// bpp 4 through 7 share one block
case 6:
case 7:
case 4:
case 5:
{
_asm {
// PRIME the pump (load the first Raw(x-bpp) data set
// no need for mask; shift clears inactive bytes
// Add 2nd active group
// there is no need for any mask
} // end _asm block
}
break;
case 2:
{
_asm {
// byte group
// byte group
// PRIME the pump (load the first Raw(x-bpp) data set
// Add 1st active group
// no need for mask; shift clears inactive
// bytes
// Add 2nd active group
// Add 3rd active group
// Add 4th active group
} // end _asm block
}
break;
case 8:
{
_asm {
// Raw(x-bpp) data set
// Now mm0 will be used as Raw(x-bpp) for
// the 2nd group of 8 bytes. This will be
// repeated for each group of 8 bytes with
// the 8th group being used as the Raw(x-bpp)
// for the 1st group of the next loop.
// be the new Raw(x-bpp) for the next loop
} // end _asm block
}
break;
default: // bpp greater than 8 bytes
{
_asm {
// add ebx
} // end _asm block
}
break;
} // end switch ( bpp )
_asm {
emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
// Optimized code for PNG Up filter decoder
/* MMX Up-filter decoder.
 * NOTE(review): the name/parameter line (presumably
 * png_read_filter_row_mmx_up -- confirm) and every assembler instruction
 * except the final emms are missing from this view.  The remaining comments
 * outline the stages in order: align, an unrolled MMX loop, an 8-bytes-at-a-
 * time MMX loop, then a byte loop using x86 registers for the remainder. */
void /* PRIVATE */
{
_asm {
// get # of bytes to alignment
// fix alignment
// Unrolled loop - use all MMX registers and interleave to reduce
// number of branch instructions (loops) and reduce partial stalls
// -8 to offset add ebx
// 2 lines added by lcreeve at netins.net
// (mail 11 Jul 98 in png-implement list)
// Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
// Loop using x86 registers to update remaining bytes
// Conversion of filtered row completed
emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
// Optimized png_read_filter_row routines
/* png_read_filter_row() -- name taken from the comment preceding this block.
 * Dispatches on `filter`: NONE does nothing; SUB, UP, AVG and PAETH each have
 * an MMX branch and a plain-C fallback; any other value writes 0 to *row.
 * NOTE(review): the name/parameter line, the !PNG_1_0_X conditions, the calls
 * in the MMX branches and most of the C fallback statements are not visible
 * in this view. */
void /* PRIVATE */
{
#ifdef PNG_DEBUG
char filnm[10];
#endif
/* mmx_supported is still at its initial value of 2 */
if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
/* this should have happened in png_init_mmx_flags() already */
#endif
}
#ifdef PNG_DEBUG
/* NOTE(review): debug-only switch; its case labels and statements are not
 * visible here, only the break statements remain */
switch (filter)
{
break;
#if !defined(PNG_1_0_X)
break;
break;
break;
break;
#else
break;
break;
break;
break;
#endif
break;
}
#endif /* PNG_DEBUG */
switch (filter)
{
case PNG_FILTER_VALUE_NONE:
break;
case PNG_FILTER_VALUE_SUB:
{
#if !defined(PNG_1_0_X)
#else
if (mmx_supported)
#endif
{
}
else
{
/* C fallback */
png_uint_32 i;
{
rp++;
}
}
break;
}
case PNG_FILTER_VALUE_UP:
{
#if !defined(PNG_1_0_X)
#else
if (mmx_supported)
#endif
{
}
else
{
/* C fallback: one pass of istop iterations, advancing rp each time */
png_uint_32 i;
for (i = 0; i < istop; ++i)
{
rp++;
}
}
break;
}
case PNG_FILTER_VALUE_AVG:
{
#if !defined(PNG_1_0_X)
#else
if (mmx_supported)
#endif
{
}
else
{
/* C fallback: a first loop of bpp iterations, then a second of istop
 * iterations; rp advances through both */
png_uint_32 i;
for (i = 0; i < bpp; i++)
{
rp++;
}
for (i = 0; i < istop; i++)
{
rp++;
}
}
break;
}
case PNG_FILTER_VALUE_PAETH:
{
#if !defined(PNG_1_0_X)
#else
if (mmx_supported)
#endif
{
}
else
{
/* C fallback: a first loop of bpp iterations, then the predictor loop */
png_uint_32 i;
for (i = 0; i < bpp; i++)
{
rp++;
}
for (i = 0; i < istop; i++) // use leftover rp,pp
{
/* a, b, c are read through lp, pp and cp respectively */
a = *lp++;
b = *pp++;
c = *cp++;
/* b - c is "pav" and a - c is "pbv" in the notation of the MMX Paeth
 * comments earlier in this file; pc temporarily holds a - c here */
p = b - c;
pc = a - c;
#ifdef PNG_USE_ABS
#else
/* pa = |b - c| */
pa = p < 0 ? -p : p;
#endif
/*
if (pa <= pb && pa <= pc)
p = a;
else if (pb <= pc)
p = b;
else
p = c;
*/
rp++;
}
}
break;
}
/* any other filter value: only the first byte of row is set to 0 here */
default:
*row=0;
break;
}
}
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */