20814N/A@@ -44,7 +44,7 @@ yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -msse2
20814N/A-yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=mmx -xO4
20814N/A+yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse -xO4
20814N/A yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse2 -xO4
20814N/A@@ -87,3 +87,10 @@ EXTRA_DSO_LDOPTS += \
20814N/A+ifeq (86,$(findstring 86,$(OS_TEST)))
20814N/A@@ -120,9 +120,11 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
20814N/A // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
20814N/A // C version does 8 at a time to mimic MMX code
20814N/A@@ -362,9 +364,12 @@ NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
20814N/A // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
20814N/A@@ -258,7 +258,7 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A-#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
20814N/A+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) && !defined(__SUNPRO_CC)
20814N/A // PIC version is slower because less registers are available, so
20814N/A // non-PIC is used on platforms where it is possible.
20814N/A@@ -564,7 +564,7 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A-#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
20814N/A+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) && !defined(__SUNPRO_CC)
20814N/A void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
20814N/A@@ -884,6 +884,128 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
20814N/A+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__SUNPRO_CC)
20814N/A+void FastConvertYUVToRGB32Row_IL(const uint8* y_buf,
20814N/A+void FastConvertYUVToRGB32Row(const uint8* y_buf,
20814N/A+ if (mozilla::supports_sse()) {
20814N/A+ FastConvertYUVToRGB32Row_IL(y_buf, u_buf, v_buf, rgb_buf, width);
20814N/A+ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
20814N/A+void ScaleYUVToRGB32Row_IL(const uint8* y_buf,
20814N/A+void ScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+ if (mozilla::supports_sse()) {
20814N/A+ ScaleYUVToRGB32Row_IL(y_buf, u_buf, v_buf, rgb_buf,
20814N/A+ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
20814N/A+void LinearScaleYUVToRGB32Row_IL(const uint8* y_buf,
20814N/A+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+ if (mozilla::supports_sse()) {
20814N/A+ LinearScaleYUVToRGB32Row_IL(y_buf, u_buf, v_buf, rgb_buf,
20814N/A+ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
20814N/A+void PICConvertYUVToRGB32Row(const uint8* y_buf,
20814N/A+void FastConvertYUVToRGB32Row(const uint8* y_buf,
20814N/A+ PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
20814N/A+void PICScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+void ScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+ PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
20814N/A+void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+ PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
20814N/A void FastConvertYUVToRGB32Row(const uint8* y_buf,
20814N/A+! void FastConvertYUVToRGB32Row_IL(const uint8* y_buf,
20814N/A+ .inline FastConvertYUVToRGB32Row_IL, 20
20814N/A+ movq kCoefficientsRgbY+2048(,%eax,8),%mm0
20814N/A+ paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0
20814N/A+ movq kCoefficientsRgbY(,%eax,8),%mm1
20814N/A+ movq kCoefficientsRgbY(,%ebx,8),%mm2
20814N/A+ movq kCoefficientsRgbY+2048(,%eax,8),%mm0
20814N/A+ paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0
20814N/A+ movq kCoefficientsRgbY(,%eax,8),%mm1
20814N/A+! void ScaleYUVToRGB32Row_IL(const uint8* y_buf,
20814N/A+ .inline ScaleYUVToRGB32Row_IL, 24
20814N/A+ movq kCoefficientsRgbY+2048(,%eax,8),%mm0
20814N/A+ paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0
20814N/A+ movq kCoefficientsRgbY(,%eax,8),%mm1
20814N/A+ movq kCoefficientsRgbY(,%eax,8),%mm2
20814N/A+ movq kCoefficientsRgbY+2048(,%eax,8),%mm0
20814N/A+ paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0
20814N/A+ movq kCoefficientsRgbY(,%eax,8),%mm1
20814N/A+! void LinearScaleYUVToRGB32Row_IL(const uint8* y_buf,
20814N/A+ .inline LinearScaleYUVToRGB32Row_IL, 24
20814N/A+ ! source_width = width * source_dx + ebx
20814N/A+ movq kCoefficientsRgbY+2048(,%ecx,8),%mm0
20814N/A+ paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0
20814N/A+ movq kCoefficientsRgbY(,%ecx,8),%mm1
20814N/A+ movq kCoefficientsRgbY(,%ecx,8),%mm2
20814N/A+! void PICConvertYUVToRGB32Row(const uint8* y_buf,
20814N/A+ .inline PICConvertYUVToRGB32Row, 24
20814N/A+! void PICScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+ .inline PICScaleYUVToRGB32Row, 28
20814N/A+! void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
20814N/A+ .inline PICLinearScaleYUVToRGB32Row, 28
20814N/A+ ! source_width = width * source_dx + ebx
20814N/A@@ -226,6 +226,10 @@ SIMD_ALIGNED(int16 kCoefficientsRgbY[256 * 3][4]) = {
20814N/A RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF),
20814N/A+#pragma align 16 (kCoefficientsRgbY)