diff --git a/gfx/ycbcr/Makefile.in b/gfx/ycbcr/Makefile.in
index 4e72694..d4e9646 100644
--- a/gfx/ycbcr/Makefile.in
+++ b/gfx/ycbcr/Makefile.in
@@ -44,7 +44,7 @@ yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -msse2
endif
ifdef SOLARIS_SUNPRO_CXX
-yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=mmx -xO4
+yuv_convert_mmx.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse -xO4
yuv_convert_sse2.$(OBJ_SUFFIX): CXXFLAGS += -xarch=sse2 -xO4
endif
@@ -87,3 +87,10 @@ EXTRA_DSO_LDOPTS += \
$(NULL)
include $(topsrcdir)/config/rules.mk
+# Sun Studio builds where OS_TEST contains "86": compile yuv_row_posix with yuv_row_posix.il passed on the compiler command line.
+ifeq (86,$(findstring 86,$(OS_TEST)))
+ifdef SOLARIS_SUNPRO_CXX
+yuv_row_posix.$(OBJ_SUFFIX): yuv_row_posix.il
+yuv_row_posix.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse -xO4 $(srcdir)/yuv_row_posix.il
+endif
+endif
diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
index 0e9e329..ae724a0 100644
--- a/gfx/ycbcr/yuv_convert.cpp
+++ b/gfx/ycbcr/yuv_convert.cpp
@@ -120,9 +120,11 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
}
}
+#ifdef ARCH_CPU_X86_FAMILY
// MMX used for FastConvertYUVToRGB32Row requires emms instruction.
if (has_sse)
EMMS();
+#endif
}
// C version does 8 at a time to mimic MMX code
@@ -362,9 +364,12 @@ NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
#endif
}
}
+
+#ifdef ARCH_CPU_X86_FAMILY
// MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
if (has_mmx)
EMMS();
+#endif
}
} // namespace gfx
diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
index b359db4..eed4c15 100644
--- a/gfx/ycbcr/yuv_row_posix.cpp
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -258,7 +258,7 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
);
}
-#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) && !defined(__SUNPRO_CC)
// PIC version is slower because less registers are available, so
// non-PIC is used on platforms where it is possible.
@@ -564,7 +564,7 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
width, source_dx);
}
-#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) && !defined(__SUNPRO_CC)
void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
const uint8* u_buf,
@@ -884,6 +884,128 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
}
+
+#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__SUNPRO_CC)
+
+void FastConvertYUVToRGB32Row_IL(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+// Entry point for this __SUNPRO_CC branch: chooses between the _IL inline template and the _C code at run time.
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  // The _IL template is entered only when mozilla::supports_sse() reports SSE; otherwise use the C version.
+  if (!mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+  } else {
+    FastConvertYUVToRGB32Row_IL(y_buf, u_buf, v_buf, rgb_buf, width);
+  }
+}
+
+void ScaleYUVToRGB32Row_IL(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb_buf,
+                           int width,
+                           int source_dx);
+// Entry point for this __SUNPRO_CC branch: chooses between the _IL inline template and the _C code at run time.
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx) {
+  // The _IL template is entered only when mozilla::supports_sse() reports SSE; otherwise use the C version.
+  if (!mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                         width, source_dx);
+  } else {
+    ScaleYUVToRGB32Row_IL(y_buf, u_buf, v_buf, rgb_buf,
+                          width, source_dx);
+  }
+}
+
+void LinearScaleYUVToRGB32Row_IL(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 int source_dx);
+// Entry point for this __SUNPRO_CC branch: chooses between the _IL inline template and the _C code at run time.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx) {
+  // The _IL template is entered only when mozilla::supports_sse() reports SSE; otherwise use the C version.
+  if (!mozilla::supports_sse()) {
+    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
+                               width, source_dx);
+  } else {
+    LinearScaleYUVToRGB32Row_IL(y_buf, u_buf, v_buf, rgb_buf,
+                                width, source_dx);
+  }
+}
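+// Disabled: wrappers for the PIC* templates in yuv_row_posix.il, which receive the kCoefficientsRgbY address as an extra argument.
+/*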
+void PICConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int16 *kCoefficientsRgbY);
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
+ &kCoefficientsRgbY[0][0]);
+}
+
+void PICScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx,
+ int16 *kCoefficientsRgbY);
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+ &kCoefficientsRgbY[0][0]);
+}
+
+void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx,
+ int16 *kCoefficientsRgbY);
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
+ &kCoefficientsRgbY[0][0]);
+}
+*/
+
#else
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
diff --git a/gfx/ycbcr/yuv_row_posix.il b/gfx/ycbcr/yuv_row_posix.il
new file mode 100644
index 0000000..faf6463
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_posix.il
@@ -0,0 +1,480 @@
+! void FastConvertYUVToRGB32Row_IL(const uint8* y_buf,
+! const uint8* u_buf,
+! const uint8* v_buf,
+! uint8* rgb_buf,
+! int width);
+    .inline FastConvertYUVToRGB32Row_IL, 20
+    pusha
+    mov 0x20(%esp),%edx
+    mov 0x24(%esp),%edi
+    mov 0x28(%esp),%esi
+    mov 0x2c(%esp),%ebp
+    mov 0x30(%esp),%ecx
+    jmp 1f
+    ! edx/edi/esi/ebp/ecx = argument slots 0-4 above the 0x20-byte pusha frame (presumably y_buf, u_buf, v_buf, rgb_buf, width). Loop: one byte each from (%edi) and (%esi) plus two from (%edx) select table rows at +2048, +4096 and +0; two pixels (8 bytes) are stored at (%ebp).
+0:
+    movzbl (%edi),%eax
+    add $0x1,%edi
+    movzbl (%esi),%ebx
+    add $0x1,%esi
+    movq kCoefficientsRgbY+2048(,%eax,8),%mm0
+    movzbl (%edx),%eax
+    paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0
+    movzbl 0x1(%edx),%ebx
+    movq kCoefficientsRgbY(,%eax,8),%mm1
+    add $0x2,%edx
+    movq kCoefficientsRgbY(,%ebx,8),%mm2
+    paddsw %mm0,%mm1
+    paddsw %mm0,%mm2
+    psraw $0x6,%mm1
+    psraw $0x6,%mm2
+    packuswb %mm2,%mm1
+    movntq %mm1,0x0(%ebp)
+    add $0x8,%ebp
+1:
+    sub $0x2,%ecx
+    jns 0b
+    ! For a non-negative width, ecx is now -1 if one pixel is left (odd width) or -2 if none; bit 0 tells the two apart.
+    and $0x1,%ecx
+    je 2f
+    ! Odd tail: convert a single pixel and store 4 bytes with movd. No emms is issued in this template.
+    movzbl (%edi),%eax
+    movq kCoefficientsRgbY+2048(,%eax,8),%mm0
+    movzbl (%esi),%eax
+    paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0
+    movzbl (%edx),%eax
+    movq kCoefficientsRgbY(,%eax,8),%mm1
+    paddsw %mm0,%mm1
+    psraw $0x6,%mm1
+    packuswb %mm1,%mm1
+    movd %mm1,0x0(%ebp)
+2:
+    popa
+    .end
+
+! void ScaleYUVToRGB32Row_IL(const uint8* y_buf,
+! const uint8* u_buf,
+! const uint8* v_buf,
+! uint8* rgb_buf,
+! int width,
+! int source_dx);
+    .inline ScaleYUVToRGB32Row_IL, 24
+    pusha
+    mov 0x20(%esp),%edx
+    mov 0x24(%esp),%edi
+    mov 0x28(%esp),%esi
+    mov 0x2c(%esp),%ebp
+    mov 0x30(%esp),%ecx
+    xor %ebx,%ebx
+    jmp 1f
+    ! ebx is the source position x: it starts at 0 and grows by the value at 0x34(%esp) (presumably source_dx) after each pixel. The edx buffer is indexed with x >> 16, the edi and esi buffers with x >> 17; two pixels (8 bytes) are stored per pass.
+0:
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%edi,%eax,1),%eax
+    movq kCoefficientsRgbY+2048(,%eax,8),%mm0
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%esi,%eax,1),%eax
+    paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%eax
+    movq kCoefficientsRgbY(,%eax,8),%mm1
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%eax
+    movq kCoefficientsRgbY(,%eax,8),%mm2
+    paddsw %mm0,%mm1
+    paddsw %mm0,%mm2
+    psraw $0x6,%mm1
+    psraw $0x6,%mm2
+    packuswb %mm2,%mm1
+    movntq %mm1,0x0(%ebp)
+    add $0x8,%ebp
+1:
+    sub $0x2,%ecx
+    jns 0b
+    ! For a non-negative width, ecx is now -1 if one pixel is left (odd width) or -2 if none; bit 0 tells the two apart.
+    and $0x1,%ecx
+    je 2f
+    ! Odd tail: one more pixel at the current x, stored as 4 bytes with movd; x is not advanced afterwards.
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%edi,%eax,1),%eax
+    movq kCoefficientsRgbY+2048(,%eax,8),%mm0
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%esi,%eax,1),%eax
+    paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0
+    mov %ebx,%eax
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%eax
+    movq kCoefficientsRgbY(,%eax,8),%mm1
+    paddsw %mm0,%mm1
+    psraw $0x6,%mm1
+    packuswb %mm1,%mm1
+    movd %mm1,0x0(%ebp)
+
+2:
+    popa
+    .end
+
+! void LinearScaleYUVToRGB32Row_IL(const uint8* y_buf,
+! const uint8* u_buf,
+! const uint8* v_buf,
+! uint8* rgb_buf,
+! int width,
+! int source_dx);
+    .inline LinearScaleYUVToRGB32Row_IL, 24
+    pusha
+    mov 0x20(%esp),%edx
+    mov 0x24(%esp),%edi
+    mov 0x2c(%esp),%ebp
+    ! edx/edi/ebp = argument slots 0, 1 and 3 (presumably y_buf, u_buf, rgb_buf); slot 2 is reloaded into esi inside the loop because esi is also used as scratch.
+    ! source_width = width * source_dx, written back over the width argument slot at 0x30(%esp)
+    mov 0x30(%esp), %ecx
+    imull 0x34(%esp), %ecx
+    mov %ecx, 0x30(%esp)
+
+    mov 0x34(%esp), %ecx
+    xor %ebx,%ebx
+    ! x = 0 (set by the xor above)
+    cmp $0x20000,%ecx
+    ! source_dx below 0x20000 (2.0 with 16 fraction bits): keep x = 0 and enter the loop
+    jl 1f
+    mov $0x8000,%ebx
+    ! otherwise x starts at 0x8000 (0.5), i.e. when scaling down to 1/2 or less
+    jmp 1f
+    ! Loop: build one blended sample from the edi buffer and one from the slot-2 buffer, then up to two blended edx samples, and emit one or two pixels.
+0:
+    mov %ebx,%eax
+    sar $0x11,%eax
+    ! First chroma term: (p[i] * (0x1fffe - f) + p[i+1] * f) >> 17 with p = edi buffer, i = x >> 17, f = x & 0x1fffe; result indexes the table at +2048.
+    movzbl (%edi,%eax,1),%ecx
+    movzbl 1(%edi,%eax,1),%esi
+    mov %ebx,%eax
+    andl $0x1fffe, %eax
+    imul %eax, %esi
+    xorl $0x1fffe, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $17, %ecx
+    movq kCoefficientsRgbY+2048(,%ecx,8),%mm0
+    ! Second chroma term: same blend on the buffer reloaded from 0x28(%esp); result indexes the table at +4096 and is added into mm0.
+    mov 0x28(%esp),%esi
+    mov %ebx,%eax
+    sar $0x11,%eax
+    ! NOTE(review): the 1(...) loads read one sample past the current index - confirm the source rows are readable there.
+    movzbl (%esi,%eax,1),%ecx
+    movzbl 1(%esi,%eax,1),%esi
+    mov %ebx,%eax
+    andl $0x1fffe, %eax
+    imul %eax, %esi
+    xorl $0x1fffe, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $17, %ecx
+    paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0
+    ! First pixel: (y[j] * (0xffff - g) + y[j+1] * g) >> 16 with y = edx buffer, j = x >> 16, g = x & 0xffff; x is advanced by 0x34(%esp).
+    mov %ebx,%eax
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%ecx
+    movzbl 1(%edx,%eax,1),%esi
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    andl $0xffff, %eax
+    imul %eax, %esi
+    xorl $0xffff, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $16, %ecx
+    movq kCoefficientsRgbY(,%ecx,8),%mm1
+    ! If x has reached source_width, only this one pixel remains: finish it at label 2.
+    cmp 0x30(%esp), %ebx
+    jge 2f
+    ! Second pixel: same blend at the advanced x, then x is advanced again.
+    mov %ebx,%eax
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%ecx
+    movzbl 1(%edx,%eax,1),%esi
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    andl $0xffff, %eax
+    imul %eax, %esi
+    xorl $0xffff, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $16, %ecx
+    movq kCoefficientsRgbY(,%ecx,8),%mm2
+    ! Add the chroma sum to both pixels, shift right by 6, pack to unsigned bytes and store 8 bytes (two pixels).
+    paddsw %mm0,%mm1
+    paddsw %mm0,%mm2
+    psraw $0x6,%mm1
+    psraw $0x6,%mm2
+    packuswb %mm2,%mm1
+    movntq %mm1,0x0(%ebp)
+    add $0x8,%ebp
+
+1:
+    cmp 0x30(%esp), %ebx
+    jl 0b
+    jmp 3f
+    ! Single trailing pixel: mm0 and mm1 are already loaded; store 4 bytes with movd.
+2:
+    paddsw %mm0, %mm1
+    psraw $6, %mm1
+    packuswb %mm1, %mm1
+    movd %mm1, (%ebp)
+
+3:
+    popa
+    .end
+
+! void PICConvertYUVToRGB32Row(const uint8* y_buf,
+! const uint8* u_buf,
+! const uint8* v_buf,
+! uint8* rgb_buf,
+! int width,
+! int16 *kCoefficientsRgbY);
+! Indexes the table through a base pointer loaded from 0x34(%esp) (the last argument slot) rather than through an absolute symbol. NOTE(review): the only callers visible in this patch are in a commented-out block in yuv_row_posix.cpp.
+    .inline PICConvertYUVToRGB32Row, 24
+    pusha
+    mov 0x20(%esp),%edx
+    mov 0x24(%esp),%edi
+    mov 0x28(%esp),%esi
+    mov 0x2c(%esp),%ebp
+    mov 0x34(%esp),%ecx
+    ! ecx holds the table base, so the pixel count is decremented in place in its stack slot, 0x30(%esp).
+    jmp 1f
+    ! Loop: one byte each from (%edi) and (%esi) plus two from (%edx) select table rows at +2048, +4096 and +0 from ecx; two pixels (8 bytes) are stored at (%ebp).
+0:
+    movzbl (%edi),%eax
+    add $0x1,%edi
+    movzbl (%esi),%ebx
+    add $0x1,%esi
+    movq 2048(%ecx,%eax,8),%mm0
+    movzbl (%edx),%eax
+    paddsw 4096(%ecx,%ebx,8),%mm0
+    movzbl 0x1(%edx),%ebx
+    movq 0(%ecx,%eax,8),%mm1
+    add $0x2,%edx
+    movq 0(%ecx,%ebx,8),%mm2
+    paddsw %mm0,%mm1
+    paddsw %mm0,%mm2
+    psraw $0x6,%mm1
+    psraw $0x6,%mm2
+    packuswb %mm2,%mm1
+    movntq %mm1,0x0(%ebp)
+    add $0x8,%ebp
+1:
+    subl $0x2,0x30(%esp)
+    jns 0b
+    ! For a non-negative width, the slot now holds -1 if one pixel is left or -2 if none; andl keeps bit 0 and sets ZF for the je.
+    andl $0x1,0x30(%esp)
+    je 2f
+    ! Odd tail: convert a single pixel and store 4 bytes with movd.
+    movzbl (%edi),%eax
+    movq 2048(%ecx,%eax,8),%mm0
+    movzbl (%esi),%eax
+    paddsw 4096(%ecx,%eax,8),%mm0
+    movzbl (%edx),%eax
+    movq 0(%ecx,%eax,8),%mm1
+    paddsw %mm0,%mm1
+    psraw $0x6,%mm1
+    packuswb %mm1,%mm1
+    movd %mm1,0x0(%ebp)
+2:
+    popa
+    .end
+
+
+! void PICScaleYUVToRGB32Row(const uint8* y_buf,
+! const uint8* u_buf,
+! const uint8* v_buf,
+! uint8* rgb_buf,
+! int width,
+! int source_dx,
+! int16 *kCoefficientsRgbY);
+! Indexes the table through a base pointer loaded from 0x38(%esp) (the last argument slot) rather than through an absolute symbol. NOTE(review): the only callers visible in this patch are in a commented-out block in yuv_row_posix.cpp.
+    .inline PICScaleYUVToRGB32Row, 28
+    pusha
+    mov 0x20(%esp),%edx
+    mov 0x24(%esp),%edi
+    mov 0x28(%esp),%esi
+    mov 0x2c(%esp),%ebp
+    mov 0x38(%esp),%ecx
+    xor %ebx,%ebx
+    jmp 1f
+    ! ebx is the source position x: it starts at 0 and grows by the value at 0x34(%esp) after each pixel. The edx buffer is indexed with x >> 16, the edi and esi buffers with x >> 17; ecx is the table base.
+0:
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%edi,%eax,1),%eax
+    movq 2048(%ecx,%eax,8),%mm0
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%esi,%eax,1),%eax
+    paddsw 4096(%ecx,%eax,8),%mm0
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%eax
+    movq 0(%ecx,%eax,8),%mm1
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%eax
+    movq 0(%ecx,%eax,8),%mm2
+    paddsw %mm0,%mm1
+    paddsw %mm0,%mm2
+    psraw $0x6,%mm1
+    psraw $0x6,%mm2
+    packuswb %mm2,%mm1
+    movntq %mm1,0x0(%ebp)
+    add $0x8,%ebp
+1:
+    subl $0x2,0x30(%esp)
+    jns 0b
+    ! For a non-negative width, the slot at 0x30(%esp) now holds -1 if one pixel is left or -2 if none; andl keeps bit 0 and sets ZF for the je.
+    andl $0x1,0x30(%esp)
+    je 2f
+    ! Odd tail: one more pixel at the current x, stored as 4 bytes with movd; x is not advanced afterwards.
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%edi,%eax,1),%eax
+    movq 2048(%ecx,%eax,8),%mm0
+    mov %ebx,%eax
+    sar $0x11,%eax
+    movzbl (%esi,%eax,1),%eax
+    paddsw 4096(%ecx,%eax,8),%mm0
+    mov %ebx,%eax
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%eax
+    movq 0(%ecx,%eax,8),%mm1
+    paddsw %mm0,%mm1
+    psraw $0x6,%mm1
+    packuswb %mm1,%mm1
+    movd %mm1,0x0(%ebp)
+
+2:
+    popa
+    .end
+
+
+! void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
+! const uint8* u_buf,
+! const uint8* v_buf,
+! uint8* rgb_buf,
+! int width,
+! int source_dx,
+! int16 *kCoefficientsRgbY);
+    .inline PICLinearScaleYUVToRGB32Row, 28
+    pusha
+    mov 0x20(%esp),%edx
+    mov 0x2c(%esp),%ebp
+    ! edx/ebp = argument slots 0 and 3 (presumably y_buf, rgb_buf); edi = slot 6, the table base, used instead of an absolute symbol.
+    mov 0x38(%esp),%edi
+    ! Slots 1 and 2 (0x24 and 0x28) are reloaded into esi inside the loop because esi is also used as scratch.
+    ! NOTE(review): the only callers visible in this patch are in a commented-out block in yuv_row_posix.cpp.
+    ! source_width = width * source_dx, written back over the width argument slot at 0x30(%esp)
+    mov 0x30(%esp), %ecx
+    imull 0x34(%esp), %ecx
+    mov %ecx, 0x30(%esp)
+
+    mov 0x34(%esp), %ecx
+    xor %ebx,%ebx
+    ! x = 0 (set by the xor above)
+    cmp $0x20000,%ecx
+    ! source_dx below 0x20000 (2.0 with 16 fraction bits): keep x = 0 and enter the loop
+    jl 1f
+    mov $0x8000,%ebx
+    ! otherwise x starts at 0x8000 (0.5), i.e. when scaling down to 1/2 or less
+    jmp 1f
+    ! Loop: build one blended sample from each of the slot-1 and slot-2 buffers, then up to two blended edx samples, and emit one or two pixels.
+0:
+    mov 0x24(%esp),%esi
+    mov %ebx,%eax
+    sar $0x11,%eax
+    ! First chroma term: (p[i] * (0x1fffe - f) + p[i+1] * f) >> 17 with i = x >> 17, f = x & 0x1fffe; result indexes the table at +2048.
+    movzbl (%esi,%eax,1),%ecx
+    movzbl 1(%esi,%eax,1),%esi
+    mov %ebx,%eax
+    andl $0x1fffe, %eax
+    imul %eax, %esi
+    xorl $0x1fffe, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $17, %ecx
+    movq 2048(%edi,%ecx,8),%mm0
+    ! Second chroma term: same blend on the buffer from 0x28(%esp); result indexes the table at +4096 and is added into mm0.
+    mov 0x28(%esp),%esi
+    mov %ebx,%eax
+    sar $0x11,%eax
+    ! NOTE(review): the 1(...) loads read one sample past the current index - confirm the source rows are readable there.
+    movzbl (%esi,%eax,1),%ecx
+    movzbl 1(%esi,%eax,1),%esi
+    mov %ebx,%eax
+    andl $0x1fffe, %eax
+    imul %eax, %esi
+    xorl $0x1fffe, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $17, %ecx
+    paddsw 4096(%edi,%ecx,8),%mm0
+    ! First pixel: (y[j] * (0xffff - g) + y[j+1] * g) >> 16 with y = edx buffer, j = x >> 16, g = x & 0xffff; x is advanced by 0x34(%esp).
+    mov %ebx,%eax
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%ecx
+    movzbl 1(%edx,%eax,1),%esi
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    andl $0xffff, %eax
+    imul %eax, %esi
+    xorl $0xffff, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $16, %ecx
+    movq (%edi,%ecx,8),%mm1
+    ! If x has reached source_width, only this one pixel remains: finish it at label 2.
+    cmp 0x30(%esp), %ebx
+    jge 2f
+    ! Second pixel: same blend at the advanced x, then x is advanced again.
+    mov %ebx,%eax
+    sar $0x10,%eax
+    movzbl (%edx,%eax,1),%ecx
+    movzbl 1(%edx,%eax,1),%esi
+    mov %ebx,%eax
+    add 0x34(%esp),%ebx
+    andl $0xffff, %eax
+    imul %eax, %esi
+    xorl $0xffff, %eax
+    imul %eax, %ecx
+    addl %esi, %ecx
+    shrl $16, %ecx
+    movq (%edi,%ecx,8),%mm2
+    ! Add the chroma sum to both pixels, shift right by 6, pack to unsigned bytes and store 8 bytes (two pixels).
+    paddsw %mm0,%mm1
+    paddsw %mm0,%mm2
+    psraw $0x6,%mm1
+    psraw $0x6,%mm2
+    packuswb %mm2,%mm1
+    movntq %mm1,0x0(%ebp)
+    add $0x8,%ebp
+
+1:
+    cmp %ebx, 0x30(%esp)
+    jg 0b
+    jmp 3f
+    ! Single trailing pixel: mm0 and mm1 are already loaded; store 4 bytes with movd.
+2:
+    paddsw %mm0, %mm1
+    psraw $6, %mm1
+    packuswb %mm1, %mm1
+    movd %mm1, (%ebp)
+
+3:
+    popa
+    .end
diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
index ad71341..518e947 100644
--- a/gfx/ycbcr/yuv_row_table.cpp
+++ b/gfx/ycbcr/yuv_row_table.cpp
@@ -226,6 +226,10 @@ SIMD_ALIGNED(int16 kCoefficientsRgbY[256 * 3][4]) = {
RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF),
};
+// NOTE(review): Sun Studio documents that "#pragma align" is ignored unless it precedes the variable's declaration; here it follows the definition - confirm it takes effect.
+#ifdef __SUNPRO_CC
+#pragma align 16 (kCoefficientsRgbY)
+#endif
#undef RGBY
#undef RGBU
#undef RGBV