/*
* Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* FILENAME: mlib_ImageChannelExtract_1.c
*
* FUNCTIONS
* mlib_v_ImageChannelExtract_U8_21_A8D1X8
* mlib_v_ImageChannelExtract_U8_21_A8D2X8
* mlib_v_ImageChannelExtract_U8_21_D1
* mlib_v_ImageChannelExtract_U8_21
* mlib_v_ImageChannelExtract_U8_31_A8D1X8
* mlib_v_ImageChannelExtract_U8_31_A8D2X8
* mlib_v_ImageChannelExtract_U8_31_D1
* mlib_v_ImageChannelExtract_U8_31
* mlib_v_ImageChannelExtract_U8_41_A8D1X8
* mlib_v_ImageChannelExtract_U8_41_A8D2X8
* mlib_v_ImageChannelExtract_U8_41_D1
* mlib_v_ImageChannelExtract_U8_41
* mlib_v_ImageChannelExtract_S16_21_A8D1X4
* mlib_v_ImageChannelExtract_S16_21_A8D2X4
* mlib_v_ImageChannelExtract_S16_21_D1
* mlib_v_ImageChannelExtract_S16_21
* mlib_v_ImageChannelExtract_S16_31_A8D1X4
* mlib_v_ImageChannelExtract_S16_31_A8D2X4
* mlib_v_ImageChannelExtract_S16_31_D1
* mlib_v_ImageChannelExtract_S16_31
* mlib_v_ImageChannelExtract_S16_41_A8D1X4
* mlib_v_ImageChannelExtract_S16_41_A8D2X4
* mlib_v_ImageChannelExtract_S16_41_D1
* mlib_v_ImageChannelExtract_S16_41
*
* ARGUMENT
* src pointer to source image data
* dst pointer to destination image data
* slb source image line stride in bytes
* dlb destination image line stride in bytes
* dsize image data size in pixels
* xsize image width in pixels
* ysize image height in lines
* cmask channel mask
*
* DESCRIPTION
* Extract the one selected channel of the source image into the
* 1-channel destination image.
*
* NOTE
* These functions are separated from mlib_ImageChannelExtract.c
* for loop unrolling and structure clarity.
*/
#include "vis_proto.h"
#include "mlib_image.h"
#include "mlib_v_ImageChannelExtract.h"
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_21L(sd0, sd1, dd)
 *
 * De-interleaves the 16 source bytes held in sd0:sd1 with three rounds of
 * vis_fpmerge and leaves the result in dd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands
 * (a0 b0 a1 b1 a2 b2 a3 b3), dd receives the even-positioned bytes
 * 0, 2, 4, ..., 14 of sd0:sd1 -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc and sdd, which the caller must have declared as mlib_d64.
 * Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_21L(sd0, sd1, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \
sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd))
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_21R(sd0, sd1, dd)
 *
 * Same merge network as CHANNELEXTRACT_U8_21L except that the final merge
 * takes the low halves of sdc/sdd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives the odd-positioned bytes 1, 3, 5, ..., 15 of sd0:sd1
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc and sdd, which the caller must have declared as mlib_d64.
 * Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_21R(sd0, sd1, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \
sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd))
/***************************************************************/
/*
 * Extract one channel from a 2-channel U8 image stored as a single
 * contiguous run of pixels.
 *
 * Preconditions (relied upon, not checked here): src and dst are 8-byte
 * aligned and dsize is a multiple of 8.  Only dsize / 8 whole blocks are
 * processed; each block reads 16 source bytes and writes 8 destination bytes.
 *
 *   src    2-channel source data
 *   dst    1-channel destination data
 *   dsize  number of pixels to extract
 *   cmask  2 uses CHANNELEXTRACT_U8_21L, any other value uses
 *          CHANNELEXTRACT_U8_21R
 */
void mlib_v_ImageChannelExtract_U8_21_A8D1X8(const mlib_u8 *src,
                                             mlib_u8 *dst,
                                             mlib_s32 dsize,
                                             mlib_s32 cmask)
{
  const mlib_d64 *in = (const mlib_d64 *) src;
  mlib_d64 *out = (mlib_d64 *) dst;
  mlib_d64 sda, sdb, sdc, sdd;          /* scratch for the extract macros */
  mlib_d64 w0, w1, res;
  mlib_s32 nblk = dsize / 8;            /* number of 8-pixel blocks */
  mlib_s32 k;

  if (cmask != 2) {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[2 * k];
      w1 = in[2 * k + 1];
      CHANNELEXTRACT_U8_21R(w0, w1, res);
      out[k] = res;
    }
    return;
  }

#pragma pipeloop(0)
  for (k = 0; k < nblk; k++) {
    w0 = in[2 * k];
    w1 = in[2 * k + 1];
    CHANNELEXTRACT_U8_21L(w0, w1, res);
    out[k] = res;
  }
}
/***************************************************************/
/*
 * Extract one channel from a 2-channel U8 image, row by row.
 *
 * Preconditions (relied upon, not checked here): every source and
 * destination row starts on an 8-byte boundary and xsize is a multiple of 8.
 * Only xsize / 8 whole blocks are processed per row.
 *
 *   src    2-channel source data
 *   slb    source line stride in bytes
 *   dst    1-channel destination data
 *   dlb    destination line stride in bytes
 *   xsize  image width in pixels
 *   ysize  image height in lines
 *   cmask  2 uses CHANNELEXTRACT_U8_21L, any other value uses
 *          CHANNELEXTRACT_U8_21R
 */
void mlib_v_ImageChannelExtract_U8_21_A8D2X8(const mlib_u8 *src,
                                             mlib_s32 slb,
                                             mlib_u8 *dst,
                                             mlib_s32 dlb,
                                             mlib_s32 xsize,
                                             mlib_s32 ysize,
                                             mlib_s32 cmask)
{
  const mlib_u8 *srow = src;            /* start of the current source row */
  mlib_u8 *drow = dst;                  /* start of the current dest row */
  mlib_d64 sda, sdb, sdc, sdd;          /* scratch for the extract macros */
  mlib_d64 w0, w1, res;
  mlib_s32 nblk = xsize / 8;            /* 8-pixel blocks per row */
  mlib_s32 row, k;

  for (row = 0; row < ysize; row++) {
    const mlib_d64 *in = (const mlib_d64 *) srow;
    mlib_d64 *out = (mlib_d64 *) drow;

    if (cmask == 2) {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[2 * k];
        w1 = in[2 * k + 1];
        CHANNELEXTRACT_U8_21L(w0, w1, res);
        out[k] = res;
      }
    }
    else {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[2 * k];
        w1 = in[2 * k + 1];
        CHANNELEXTRACT_U8_21R(w0, w1, res);
        out[k] = res;
      }
    }

    /* step both images to the next line */
    srow += slb;
    drow += dlb;
  }
}
/***************************************************************/
/*
 * Extract one channel from a contiguous run of a 2-channel U8 image, with no
 * alignment requirement on src or dst.
 *
 *   src    2-channel source data
 *   dst    1-channel destination data
 *   dsize  number of pixels to extract
 *   cmask  2 selects the first byte of each source pixel, any other value
 *          the second byte
 *
 * Source data is loaded as whole 8-byte words starting at src rounded down to
 * an 8-byte boundary, and results are written as whole 8-byte words starting
 * at dst rounded down.  The first and last stores go through vis_pst_8 with
 * an edge mask computed from dst and the last destination byte.
 * NOTE(review): because whole aligned words are loaded (and the end-point
 * step always loads two more words), source bytes outside the requested
 * range may be read -- confirm callers' buffers tolerate that.
 */
void mlib_v_ImageChannelExtract_U8_21_D1(const mlib_u8 *src,
mlib_u8 *dst,
mlib_s32 dsize,
mlib_s32 cmask)
{
mlib_u8 *sa, *da;
mlib_u8 *dend, *dend2; /* end points in dst */
mlib_d64 *dp; /* 8-byte aligned start points in dst */
mlib_d64 *sp; /* 8-byte aligned start point in src */
mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
mlib_d64 sda, sdb, sdc, sdd; /* scratch written by the CHANNELEXTRACT_U8_21x macros */
mlib_d64 dd0, dd1; /* previous / current extracted 8-byte words */
mlib_s32 soff; /* offset of address in src */
mlib_s32 doff; /* offset of address in dst */
mlib_s32 off; /* offset of src over dst */
mlib_s32 emask; /* edge mask */
mlib_s32 i, n;
sa = (void *)src;
da = dst;
/* prepare the source address: round down to 8 bytes, keep the remainder */
sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
soff = ((mlib_addr) sa & 7);
/* prepare the destination addresses */
dp = (mlib_d64 *) ((mlib_addr) da & (~7));
doff = ((mlib_addr) da & 7);
/* dend is the last destination byte; dend2 is 7 bytes before it and bounds the unmasked 8-byte stores */
dend = da + dsize - 1;
dend2 = dend - 7;
/* calculate the src's offset over dst: index of the first wanted byte */
/* inside the first extracted word, minus the dst byte offset */
if (cmask == 2) {
off = soff / 2 - doff;
}
else {
off = (soff + 1) / 2 - doff;
}
/* pick the macro by the parity of the wanted byte's position within the aligned source words */
if (((cmask == 2) && (soff % 2 == 0)) || ((cmask == 1) && (soff % 2 != 0))) { /* extract even bytes */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
/* store 8 bytes result; the mask limits which bytes are written */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
/* number of full 8-byte destination words remaining */
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the last, partial word */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
/* extracted words must be shifted by off bytes before storing; */
/* presumably vis_alignaddr sets the byte offset later used by vis_faligndata -- confirm */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 16 bytes */
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes; only one extracted word is available, so it is paired with itself */
CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 32 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes taken from two consecutive extracted words */
CHANNELEXTRACT_U8_21L(sd0, sd1, dd0);
CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous extracted word and extract the next one */
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_21L(sd2, sd3, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else { /* extract odd bytes */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 16 bytes; any garbage before the start point is masked out on store */
sd0 = *sp++;
sd1 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
/* same shifted-store scheme as the even-byte case above, using the 21R macro */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 16 bytes */
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 32 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_21R(sd0, sd1, dd0);
CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_21R(sd2, sd3, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
}
/***************************************************************/
/*
 * Extract one channel from a 2-channel U8 image with arbitrary alignment
 * and strides, by running the one-dimensional routine on each line.
 *
 *   src    2-channel source data
 *   slb    source line stride in bytes
 *   dst    1-channel destination data
 *   dlb    destination line stride in bytes
 *   xsize  image width in pixels
 *   ysize  image height in lines
 *   cmask  channel mask, passed through unchanged to the per-line routine
 */
void mlib_v_ImageChannelExtract_U8_21(const mlib_u8 *src,
                                      mlib_s32 slb,
                                      mlib_u8 *dst,
                                      mlib_s32 dlb,
                                      mlib_s32 xsize,
                                      mlib_s32 ysize,
                                      mlib_s32 cmask)
{
  const mlib_u8 *srow = src;            /* start of the current source line */
  mlib_u8 *drow = dst;                  /* start of the current dest line */
  mlib_s32 lines_left;

  for (lines_left = ysize; lines_left > 0; lines_left--) {
    mlib_v_ImageChannelExtract_U8_21_D1(srow, drow, xsize, cmask);
    srow += slb;
    drow += dlb;
  }
}
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd)
 *
 * De-interleaves the 24 source bytes held in sd0:sd1:sd2 with vis_fpmerge
 * and leaves 8 result bytes in dd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 0, 3, 6, ..., 21 of the 24-byte group
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd and sde, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \
dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde))
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd)
 *
 * De-interleaves the 24 source bytes held in sd0:sd1:sd2 with vis_fpmerge
 * and leaves 8 result bytes in dd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 1, 4, 7, ..., 22 of the 24-byte group
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd and sde, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \
dd = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sde))
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd)
 *
 * De-interleaves the 24 source bytes held in sd0:sd1:sd2 with vis_fpmerge
 * and leaves 8 result bytes in dd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 2, 5, 8, ..., 23 of the 24-byte group
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd and sde, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
sdd = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \
sde = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \
dd = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde))
/***************************************************************/
/*
 * Extract one channel from a 3-channel U8 image stored as a single
 * contiguous run of pixels.
 *
 * The code loads and stores whole mlib_d64 words through src and dst and
 * processes dsize / 8 blocks, so it relies on both pointers being 8-byte
 * aligned and dsize being a multiple of 8.  Each block reads 24 source
 * bytes and writes 8 destination bytes.
 *
 *   src    3-channel source data
 *   dst    1-channel destination data
 *   dsize  number of pixels to extract
 *   cmask  4 uses CHANNELEXTRACT_U8_31L, 2 uses CHANNELEXTRACT_U8_31M,
 *          any other value uses CHANNELEXTRACT_U8_31R
 */
void mlib_v_ImageChannelExtract_U8_31_A8D1X8(const mlib_u8 *src,
                                             mlib_u8 *dst,
                                             mlib_s32 dsize,
                                             mlib_s32 cmask)
{
  const mlib_d64 *in = (const mlib_d64 *) src;
  mlib_d64 *out = (mlib_d64 *) dst;
  mlib_d64 sda, sdb, sdc, sdd, sde;     /* scratch for the extract macros */
  mlib_d64 w0, w1, w2, res;
  mlib_s32 nblk = dsize / 8;            /* number of 8-pixel blocks */
  mlib_s32 k;

  switch (cmask) {
  case 4: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[3 * k];
      w1 = in[3 * k + 1];
      w2 = in[3 * k + 2];
      CHANNELEXTRACT_U8_31L(w0, w1, w2, res);
      out[k] = res;
    }
    break;
  }
  case 2: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[3 * k];
      w1 = in[3 * k + 1];
      w2 = in[3 * k + 2];
      CHANNELEXTRACT_U8_31M(w0, w1, w2, res);
      out[k] = res;
    }
    break;
  }
  default: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[3 * k];
      w1 = in[3 * k + 1];
      w2 = in[3 * k + 2];
      CHANNELEXTRACT_U8_31R(w0, w1, w2, res);
      out[k] = res;
    }
    break;
  }
  }
}
/***************************************************************/
/*
 * Extract one channel from a 3-channel U8 image, row by row.
 *
 * The code loads and stores whole mlib_d64 words from the start of every
 * row and processes xsize / 8 blocks per row, so it relies on each row
 * being 8-byte aligned and xsize being a multiple of 8.
 *
 *   src    3-channel source data
 *   slb    source line stride in bytes
 *   dst    1-channel destination data
 *   dlb    destination line stride in bytes
 *   xsize  image width in pixels
 *   ysize  image height in lines
 *   cmask  4 uses CHANNELEXTRACT_U8_31L, 2 uses CHANNELEXTRACT_U8_31M,
 *          any other value uses CHANNELEXTRACT_U8_31R
 */
void mlib_v_ImageChannelExtract_U8_31_A8D2X8(const mlib_u8 *src,
                                             mlib_s32 slb,
                                             mlib_u8 *dst,
                                             mlib_s32 dlb,
                                             mlib_s32 xsize,
                                             mlib_s32 ysize,
                                             mlib_s32 cmask)
{
  const mlib_u8 *srow = src;            /* start of the current source row */
  mlib_u8 *drow = dst;                  /* start of the current dest row */
  mlib_d64 sda, sdb, sdc, sdd, sde;     /* scratch for the extract macros */
  mlib_d64 w0, w1, w2, res;
  mlib_s32 nblk = xsize / 8;            /* 8-pixel blocks per row */
  mlib_s32 row, k;

  for (row = 0; row < ysize; row++) {
    const mlib_d64 *in = (const mlib_d64 *) srow;
    mlib_d64 *out = (mlib_d64 *) drow;

    switch (cmask) {
    case 4: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[3 * k];
        w1 = in[3 * k + 1];
        w2 = in[3 * k + 2];
        CHANNELEXTRACT_U8_31L(w0, w1, w2, res);
        out[k] = res;
      }
      break;
    }
    case 2: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[3 * k];
        w1 = in[3 * k + 1];
        w2 = in[3 * k + 2];
        CHANNELEXTRACT_U8_31M(w0, w1, w2, res);
        out[k] = res;
      }
      break;
    }
    default: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[3 * k];
        w1 = in[3 * k + 1];
        w2 = in[3 * k + 2];
        CHANNELEXTRACT_U8_31R(w0, w1, w2, res);
        out[k] = res;
      }
      break;
    }
    }

    /* step both images to the next line */
    srow += slb;
    drow += dlb;
  }
}
/***************************************************************/
/*
 * Extract one channel from a contiguous run of a 3-channel U8 image, with no
 * alignment requirement on src or dst.
 *
 *   src    3-channel source data
 *   dst    1-channel destination data
 *   dsize  number of pixels to extract
 *   cmask  4 selects the first byte of each source pixel, 2 the second,
 *          any other value the third
 *
 * Source data is loaded as whole 8-byte words starting at src rounded down to
 * an 8-byte boundary, three words (24 bytes) per extracted 8-byte result.
 * Results are written as whole 8-byte words starting at dst rounded down;
 * the first and last stores go through vis_pst_8 with an edge mask.
 * NOTE(review): because whole aligned words are loaded (and the end-point
 * step always loads three more words), source bytes outside the requested
 * range may be read -- confirm callers' buffers tolerate that.
 */
void mlib_v_ImageChannelExtract_U8_31_D1(const mlib_u8 *src,
mlib_u8 *dst,
mlib_s32 dsize,
mlib_s32 cmask)
{
mlib_u8 *sa, *da;
mlib_u8 *dend, *dend2; /* end points in dst */
mlib_d64 *dp; /* 8-byte aligned start points in dst */
mlib_d64 *sp; /* 8-byte aligned start point in src */
mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
mlib_d64 sd3, sd4, sd5;
mlib_d64 sda, sdb, sdc, sdd, sde; /* scratch written by the CHANNELEXTRACT_U8_31x macros */
mlib_d64 dd0, dd1; /* previous / current extracted 8-byte words */
mlib_s32 soff; /* offset of address in src */
mlib_s32 doff; /* offset of address in dst */
mlib_s32 off; /* offset of src over dst */
mlib_s32 emask; /* edge mask */
mlib_s32 i, n;
sa = (void *)src;
da = dst;
/* prepare the source address: round down to 8 bytes, keep the remainder */
sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
soff = ((mlib_addr) sa & 7);
/* prepare the destination addresses */
dp = (mlib_d64 *) ((mlib_addr) da & (~7));
doff = ((mlib_addr) da & 7);
/* dend is the last destination byte; dend2 is 7 bytes before it and bounds the unmasked 8-byte stores */
dend = da + dsize - 1;
dend2 = dend - 7;
/* calculate the src's offset over dst: index of the first wanted byte */
/* inside the first extracted word, minus the dst byte offset */
if (cmask == 4) {
off = soff / 3 - doff;
}
else if (cmask == 2) {
off = (soff + 1) / 3 - doff;
}
else {
off = (soff + 2) / 3 - doff;
}
/* pick the macro by where the wanted byte falls (mod 3) within the aligned source words */
if (((cmask == 4) && (soff % 3 == 0)) ||
((cmask == 2) && (soff % 3 == 2)) ||
((cmask == 1) && (soff % 3 == 1))) { /* extract left channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 24 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
/* store 8 bytes result; the mask limits which bytes are written */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
/* number of full 8-byte destination words remaining */
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the last, partial word */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
/* extracted words must be shifted by off bytes before storing; */
/* presumably vis_alignaddr sets the byte offset later used by vis_faligndata -- confirm */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes; only one extracted word is available, so it is paired with itself */
CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes taken from two consecutive extracted words */
CHANNELEXTRACT_U8_31L(sd0, sd1, sd2, dd0);
CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous extracted word and extract the next one */
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_U8_31L(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else if (((cmask == 4) && (soff % 3 == 1)) ||
((cmask == 2) && (soff % 3 == 0)) ||
((cmask == 1) && (soff % 3 == 2))) {
/* extract middle channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 24 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
/* store 8 bytes result */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
/* same shifted-store scheme as the left-channel case above, using the 31M macro */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_31M(sd0, sd1, sd2, dd0);
CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_U8_31M(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else { /* extract right channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 24 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
/* store 8 bytes result */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
/* same shifted-store scheme as the left-channel case above, using the 31R macro */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_31R(sd0, sd1, sd2, dd0);
CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_U8_31R(sd3, sd4, sd5, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
}
/***************************************************************/
/*
 * Extract one channel from a 3-channel U8 image with arbitrary alignment
 * and strides, by running the one-dimensional routine on each line.
 *
 *   src    3-channel source data
 *   slb    source line stride in bytes
 *   dst    1-channel destination data
 *   dlb    destination line stride in bytes
 *   xsize  image width in pixels
 *   ysize  image height in lines
 *   cmask  channel mask, passed through unchanged to the per-line routine
 */
void mlib_v_ImageChannelExtract_U8_31(const mlib_u8 *src,
                                      mlib_s32 slb,
                                      mlib_u8 *dst,
                                      mlib_s32 dlb,
                                      mlib_s32 xsize,
                                      mlib_s32 ysize,
                                      mlib_s32 cmask)
{
  const mlib_u8 *srow = src;            /* start of the current source line */
  mlib_u8 *drow = dst;                  /* start of the current dest line */
  mlib_s32 lines_left;

  for (lines_left = ysize; lines_left > 0; lines_left--) {
    mlib_v_ImageChannelExtract_U8_31_D1(srow, drow, xsize, cmask);
    srow += slb;
    drow += dlb;
  }
}
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd)
 *
 * De-interleaves the 32 source bytes held in sd0:sd1:sd2:sd3 with
 * vis_fpmerge and leaves 8 result bytes in dd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 0, 4, 8, ..., 28 of the 32-byte group
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd, sde and sdf, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc)); \
sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd)); \
dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf))
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd)
 *
 * Same merge network as CHANNELEXTRACT_U8_41L except that the final merge
 * takes the low halves of sde/sdf.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 1, 5, 9, ..., 29 of the 32-byte group sd0:sd1:sd2:sd3
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd, sde and sdf, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
sde = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdc)); \
sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdd)); \
dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf))
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd)
 *
 * De-interleaves the 32 source bytes held in sd0:sd1:sd2:sd3 with
 * vis_fpmerge and leaves 8 result bytes in dd.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 2, 6, 10, ..., 30 of the 32-byte group
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd, sde and sdf, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc)); \
sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd)); \
dd = vis_fpmerge(vis_read_hi(sde), vis_read_hi(sdf))
/***************************************************************/
/*
 * CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd)
 *
 * Same merge network as CHANNELEXTRACT_U8_41MR except that the final merge
 * takes the low halves of sde/sdf.
 * Assuming vis_fpmerge interleaves the bytes of its two 32-bit operands,
 * dd receives bytes 3, 7, 11, ..., 31 of the 32-byte group sd0:sd1:sd2:sd3
 * -- TODO confirm against the VIS manual.
 *
 * The macro expands to several statements and writes the scratch variables
 * sda, sdb, sdc, sdd, sde and sdf, which the caller must have declared as
 * mlib_d64.  Use it only as a stand-alone statement inside a braced block.
 */
#define CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
sdd = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
sde = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdc)); \
sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdd)); \
dd = vis_fpmerge(vis_read_lo(sde), vis_read_lo(sdf))
/***************************************************************/
/*
 * Extract one channel from a 4-channel U8 image stored as a single
 * contiguous run of pixels.
 *
 * The code loads and stores whole mlib_d64 words through src and dst and
 * processes dsize / 8 blocks, so it relies on both pointers being 8-byte
 * aligned and dsize being a multiple of 8.  Each block reads 32 source
 * bytes and writes 8 destination bytes.
 *
 *   src    4-channel source data
 *   dst    1-channel destination data
 *   dsize  number of pixels to extract
 *   cmask  8 uses CHANNELEXTRACT_U8_41L, 4 uses CHANNELEXTRACT_U8_41ML,
 *          2 uses CHANNELEXTRACT_U8_41MR, any other value uses
 *          CHANNELEXTRACT_U8_41R
 */
void mlib_v_ImageChannelExtract_U8_41_A8D1X8(const mlib_u8 *src,
                                             mlib_u8 *dst,
                                             mlib_s32 dsize,
                                             mlib_s32 cmask)
{
  const mlib_d64 *in = (const mlib_d64 *) src;
  mlib_d64 *out = (mlib_d64 *) dst;
  mlib_d64 sda, sdb, sdc, sdd, sde, sdf;  /* scratch for the extract macros */
  mlib_d64 w0, w1, w2, w3, res;
  mlib_s32 nblk = dsize / 8;              /* number of 8-pixel blocks */
  mlib_s32 k;

  switch (cmask) {
  case 8: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[4 * k];
      w1 = in[4 * k + 1];
      w2 = in[4 * k + 2];
      w3 = in[4 * k + 3];
      CHANNELEXTRACT_U8_41L(w0, w1, w2, w3, res);
      out[k] = res;
    }
    break;
  }
  case 4: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[4 * k];
      w1 = in[4 * k + 1];
      w2 = in[4 * k + 2];
      w3 = in[4 * k + 3];
      CHANNELEXTRACT_U8_41ML(w0, w1, w2, w3, res);
      out[k] = res;
    }
    break;
  }
  case 2: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[4 * k];
      w1 = in[4 * k + 1];
      w2 = in[4 * k + 2];
      w3 = in[4 * k + 3];
      CHANNELEXTRACT_U8_41MR(w0, w1, w2, w3, res);
      out[k] = res;
    }
    break;
  }
  default: {
#pragma pipeloop(0)
    for (k = 0; k < nblk; k++) {
      w0 = in[4 * k];
      w1 = in[4 * k + 1];
      w2 = in[4 * k + 2];
      w3 = in[4 * k + 3];
      CHANNELEXTRACT_U8_41R(w0, w1, w2, w3, res);
      out[k] = res;
    }
    break;
  }
  }
}
/***************************************************************/
/*
 * Extract one channel from a 4-channel U8 image, row by row.
 *
 * The code loads and stores whole mlib_d64 words from the start of every
 * row and processes xsize / 8 blocks per row, so it relies on each row
 * being 8-byte aligned and xsize being a multiple of 8.
 *
 *   src    4-channel source data
 *   slb    source line stride in bytes
 *   dst    1-channel destination data
 *   dlb    destination line stride in bytes
 *   xsize  image width in pixels
 *   ysize  image height in lines
 *   cmask  8 uses CHANNELEXTRACT_U8_41L, 4 uses CHANNELEXTRACT_U8_41ML,
 *          2 uses CHANNELEXTRACT_U8_41MR, any other value uses
 *          CHANNELEXTRACT_U8_41R
 */
void mlib_v_ImageChannelExtract_U8_41_A8D2X8(const mlib_u8 *src,
                                             mlib_s32 slb,
                                             mlib_u8 *dst,
                                             mlib_s32 dlb,
                                             mlib_s32 xsize,
                                             mlib_s32 ysize,
                                             mlib_s32 cmask)
{
  const mlib_u8 *srow = src;              /* start of the current source row */
  mlib_u8 *drow = dst;                    /* start of the current dest row */
  mlib_d64 sda, sdb, sdc, sdd, sde, sdf;  /* scratch for the extract macros */
  mlib_d64 w0, w1, w2, w3, res;
  mlib_s32 nblk = xsize / 8;              /* 8-pixel blocks per row */
  mlib_s32 row, k;

  for (row = 0; row < ysize; row++) {
    const mlib_d64 *in = (const mlib_d64 *) srow;
    mlib_d64 *out = (mlib_d64 *) drow;

    switch (cmask) {
    case 8: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[4 * k];
        w1 = in[4 * k + 1];
        w2 = in[4 * k + 2];
        w3 = in[4 * k + 3];
        CHANNELEXTRACT_U8_41L(w0, w1, w2, w3, res);
        out[k] = res;
      }
      break;
    }
    case 4: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[4 * k];
        w1 = in[4 * k + 1];
        w2 = in[4 * k + 2];
        w3 = in[4 * k + 3];
        CHANNELEXTRACT_U8_41ML(w0, w1, w2, w3, res);
        out[k] = res;
      }
      break;
    }
    case 2: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[4 * k];
        w1 = in[4 * k + 1];
        w2 = in[4 * k + 2];
        w3 = in[4 * k + 3];
        CHANNELEXTRACT_U8_41MR(w0, w1, w2, w3, res);
        out[k] = res;
      }
      break;
    }
    default: {
#pragma pipeloop(0)
      for (k = 0; k < nblk; k++) {
        w0 = in[4 * k];
        w1 = in[4 * k + 1];
        w2 = in[4 * k + 2];
        w3 = in[4 * k + 3];
        CHANNELEXTRACT_U8_41R(w0, w1, w2, w3, res);
        out[k] = res;
      }
      break;
    }
    }

    /* step both images to the next line */
    srow += slb;
    drow += dlb;
  }
}
/***************************************************************/
void mlib_v_ImageChannelExtract_U8_41_D1(const mlib_u8 *src,
mlib_u8 *dst,
mlib_s32 dsize,
mlib_s32 cmask)
{
mlib_u8 *sa, *da;
mlib_u8 *dend, *dend2; /* end points in dst */
mlib_d64 *dp; /* 8-byte aligned start points in dst */
mlib_d64 *sp; /* 8-byte aligned start point in src */
mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
mlib_d64 sd4, sd5, sd6, sd7;
mlib_d64 sda, sdb, sdc, sdd;
mlib_d64 sde, sdf;
mlib_d64 dd0, dd1;
mlib_s32 soff; /* offset of address in src */
mlib_s32 doff; /* offset of address in dst */
mlib_s32 off; /* offset of src over dst */
mlib_s32 emask; /* edge mask */
mlib_s32 i, n;
sa = (void *)src;
da = dst;
/* prepare the source address */
sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
soff = ((mlib_addr) sa & 7);
/* prepare the destination addresses */
dp = (mlib_d64 *) ((mlib_addr) da & (~7));
doff = ((mlib_addr) da & 7);
dend = da + dsize - 1;
dend2 = dend - 7;
/* calculate the src's offset over dst */
if (cmask == 8) {
off = soff / 4 - doff;
}
else if (cmask == 4) {
off = (soff + 1) / 4 - doff;
}
else if (cmask == 2) {
off = (soff + 2) / 4 - doff;
}
else {
off = (soff + 3) / 4 - doff;
}
if (((cmask == 8) && (soff % 4 == 0)) ||
((cmask == 4) && (soff % 4 == 3)) ||
((cmask == 2) && (soff % 4 == 2)) ||
((cmask == 1) && (soff % 4 == 1))) { /* extract left channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41L(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41L(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else if (((cmask == 8) && (soff % 4 == 1)) ||
((cmask == 4) && (soff % 4 == 0)) ||
((cmask == 2) && (soff % 4 == 3)) ||
((cmask == 1) && (soff % 4 == 2))) {
/* extract middle left channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41ML(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41ML(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else if (((cmask == 8) && (soff % 4 == 2)) ||
((cmask == 4) && (soff % 4 == 1)) ||
((cmask == 2) && (soff % 4 == 0)) ||
((cmask == 1) && (soff % 4 == 3))) { /* extract middle right channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41MR(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41MR(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else { /* extract right channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_8(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
vis_pst_8(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge8(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_U8_41R(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge8(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_U8_41R(sd4, sd5, sd6, sd7, dd1);
vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
}
/***************************************************************/
/*
 * Extract one channel from a 4-channel U8 image, one row at a time.
 * Each row is handed to mlib_v_ImageChannelExtract_U8_41_D1; afterwards
 * the source and destination row pointers advance by slb and dlb bytes.
 *
 *   src, slb      source data and its line stride in bytes
 *   dst, dlb      destination data and its line stride in bytes
 *   xsize, ysize  row length passed to the row routine, and row count
 *   cmask         channel selector, passed through unchanged
 */
void mlib_v_ImageChannelExtract_U8_41(const mlib_u8 *src,
                                      mlib_s32      slb,
                                      mlib_u8       *dst,
                                      mlib_s32      dlb,
                                      mlib_s32      xsize,
                                      mlib_s32      ysize,
                                      mlib_s32      cmask)
{
  mlib_u8 *src_row = (void *)src;
  mlib_u8 *dst_row = dst;
  mlib_s32 rows_left;

  for (rows_left = ysize; rows_left > 0; rows_left--) {
    mlib_v_ImageChannelExtract_U8_41_D1(src_row, dst_row, xsize, cmask);
    src_row += slb;
    dst_row += dlb;
  }
}
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_21L(sd0, sd1, dd)
 *
 * Combines the two 8-byte words sd0 and sd1 into one 8-byte result dd using
 * the "hi" halves of the intermediate merges; this is the variant used for
 * the first 16-bit element of each 2-element pair.
 * NOTE(review): this reading assumes vis_fpmerge byte-interleaves its two
 * 32-bit operands (standard VIS behaviour) -- confirm against the VIS docs.
 *
 * The macro writes the caller's mlib_d64 locals sda, sdb and sdc, so those
 * must be declared at every use site.  It expands to several statements and
 * is not wrapped in do { } while (0): do not use it as the body of an
 * unbraced if/for.
 */
#define CHANNELEXTRACT_S16_21L(sd0, sd1, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_21R(sd0, sd1, dd)
 *
 * Same shuffle as the 21L variant except that the third merge takes the
 * "lo" halves of sda/sdb; this is the variant used for the second 16-bit
 * element of each 2-element pair.
 * NOTE(review): this reading assumes vis_fpmerge byte-interleaves its two
 * 32-bit operands (standard VIS behaviour) -- confirm against the VIS docs.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper.
 */
#define CHANNELEXTRACT_S16_21R(sd0, sd1, dd) \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd1)); \
sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * Extract one channel from a 2-channel S16 image stored as one contiguous
 * run.  Source and destination data are both 8-byte aligned and dsize is
 * a multiple of 4.
 *
 * Every iteration consumes two 8-byte source words and produces one
 * 8-byte destination word.  cmask == 2 uses the "L" shuffle, any other
 * value the "R" shuffle.
 */
void mlib_v_ImageChannelExtract_S16_21_A8D1X4(const mlib_s16 *src,
                                              mlib_s16       *dst,
                                              mlib_s32       dsize,
                                              mlib_s32       cmask)
{
  mlib_d64 *in = (mlib_d64 *) src;
  mlib_d64 *out = (mlib_d64 *) dst;
  mlib_d64 w0, w1;                 /* current pair of source words */
  mlib_d64 sda, sdb, sdc;          /* scratch written by the macros */
  mlib_d64 res;
  mlib_s32 groups = dsize / 4;     /* number of 8-byte output words */
  mlib_s32 k;

  if (cmask == 2) {
#pragma pipeloop(0)
    for (k = 0; k < groups; k++) {
      w0 = in[0];
      w1 = in[1];
      in += 2;
      CHANNELEXTRACT_S16_21L(w0, w1, res);
      *out++ = res;
    }
    return;
  }

#pragma pipeloop(0)
  for (k = 0; k < groups; k++) {
    w0 = in[0];
    w1 = in[1];
    in += 2;
    CHANNELEXTRACT_S16_21R(w0, w1, res);
    *out++ = res;
  }
}
/***************************************************************/
/*
 * Two-dimensional form of the aligned 2-channel -> 1-channel S16 extract.
 * For each of the ysize rows, xsize / 4 output words are produced from
 * twice as many source words; the row start pointers then advance by
 * slb / dlb bytes.  cmask == 2 uses the "L" shuffle, any other value the
 * "R" shuffle.
 */
void mlib_v_ImageChannelExtract_S16_21_A8D2X4(const mlib_s16 *src,
                                              mlib_s32       slb,
                                              mlib_s16       *dst,
                                              mlib_s32       dlb,
                                              mlib_s32       xsize,
                                              mlib_s32       ysize,
                                              mlib_s32       cmask)
{
  mlib_u8 *src_row = (mlib_u8 *) src;   /* byte pointers make the stride math direct */
  mlib_u8 *dst_row = (mlib_u8 *) dst;
  mlib_d64 *in, *out;
  mlib_d64 w0, w1;
  mlib_d64 sda, sdb, sdc;               /* scratch written by the macros */
  mlib_d64 res;
  mlib_s32 groups = xsize / 4;          /* 8-byte output words per row */
  mlib_s32 row, k;

  if (cmask == 2) {
    for (row = 0; row < ysize; row++) {
      in = (mlib_d64 *) src_row;
      out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        in += 2;
        CHANNELEXTRACT_S16_21L(w0, w1, res);
        *out++ = res;
      }
      src_row += slb;
      dst_row += dlb;
    }
    return;
  }

  for (row = 0; row < ysize; row++) {
    in = (mlib_d64 *) src_row;
    out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
    for (k = 0; k < groups; k++) {
      w0 = in[0];
      w1 = in[1];
      in += 2;
      CHANNELEXTRACT_S16_21R(w0, w1, res);
      *out++ = res;
    }
    src_row += slb;
    dst_row += dlb;
  }
}
/***************************************************************/
/*
 * Extract one channel from a single run of 2-channel S16 data into a
 * 1-channel S16 run, with no alignment requirement on src or dst.
 *
 *   src    first source element of the run
 *   dst    first destination element
 *   dsize  number of destination elements to produce
 *   cmask  channel selector: 2 is treated as the first element of each
 *          pair, any other value as the second
 *
 * Both pointers are rounded down to 8-byte boundaries.  The shuffle macro
 * (L or R) is chosen from cmask and the source misalignment, and "off" is
 * the byte shift fed to vis_alignaddr / vis_faligndata to line the
 * extracted data up with the destination.  The first and last destination
 * words are written with masked stores; everything in between is written
 * as whole 8-byte words.
 *
 * The source is always read in whole 8-byte words, starting at the
 * rounded-down address, so bytes outside the run itself are loaded.
 * NOTE(review): dsize <= 0 is not checked here; presumably callers
 * guarantee dsize > 0 -- confirm.
 */
void mlib_v_ImageChannelExtract_S16_21_D1(const mlib_s16 *src,
mlib_s16 *dst,
mlib_s32 dsize,
mlib_s32 cmask)
{
mlib_s16 *sa, *da;
mlib_s16 *dend, *dend2; /* dend: last dst element; dend2: dend - 3 elements */
mlib_d64 *dp; /* 8-byte aligned start points in dst */
mlib_d64 *sp; /* 8-byte aligned start point in src */
mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
mlib_d64 sda, sdb, sdc; /* scratch written by the CHANNELEXTRACT macros */
mlib_d64 dd0, dd1; /* previous / current extracted 8-byte group */
mlib_s32 soff; /* offset of address in src */
mlib_s32 doff; /* offset of address in dst */
mlib_s32 off; /* byte shift of the extracted src data relative to dst */
mlib_s32 emask; /* edge mask */
mlib_s32 i, n;
sa = (void *)src;
da = dst;
/* prepare the source address */
sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
soff = ((mlib_addr) sa & 7);
/* prepare the destination addresses */
dp = (mlib_d64 *) ((mlib_addr) da & (~7));
doff = ((mlib_addr) da & 7);
dend = da + dsize - 1;
/* dp <= dend2 is the test used below for "a whole 8-byte store still fits" */
dend2 = dend - 3;
/* calculate the byte shift between the extracted src data and dst */
if (cmask == 2) {
off = (soff / 4) * 2 - doff;
}
else {
off = ((soff + 3) / 4) * 2 - doff;
}
/* the wanted channel sits either in the even or in the odd 16-bit slots of
 * the aligned source words, depending on cmask and the source misalignment */
if (((cmask == 2) && (soff % 4 == 0)) || ((cmask == 1) && (soff % 4 != 0))) { /* extract even words */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
/* store 8 bytes result (masked by emask) */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
/* set the alignment offset used by the vis_faligndata calls below */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 16 bytes */
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes; only one extracted group is available, so it
 * is passed as both operands.
 * NOTE(review): the leading bytes are presumably masked off by emask -- confirm */
CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 32 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_21L(sd0, sd1, dd0);
CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous group: each output word spans two extracted groups */
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_21L(sd2, sd3, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else { /* extract odd words */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 16 bytes, don't care the garbage at the start point */
sd0 = *sp++;
sd1 = *sp++;
/* extract and store 8 bytes (masked by emask) */
CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
/* set the alignment offset used by the vis_faligndata calls below */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 16 bytes */
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes; the single extracted group is passed as both
 * operands of vis_faligndata */
CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 32 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_21R(sd0, sd1, dd0);
CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous group: each output word spans two extracted groups */
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_21R(sd2, sd3, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
}
/***************************************************************/
/*
 * Extract one channel from a 2-channel S16 image, one row at a time.
 * Each row is handed to mlib_v_ImageChannelExtract_S16_21_D1; the row
 * pointers are then advanced by slb / dlb, which are strides in bytes.
 */
void mlib_v_ImageChannelExtract_S16_21(const mlib_s16 *src,
                                       mlib_s32       slb,
                                       mlib_s16       *dst,
                                       mlib_s32       dlb,
                                       mlib_s32       xsize,
                                       mlib_s32       ysize,
                                       mlib_s32       cmask)
{
  mlib_s16 *src_row = (void *)src;
  mlib_s16 *dst_row = dst;
  mlib_s32 rows_left;

  for (rows_left = ysize; rows_left > 0; rows_left--) {
    mlib_v_ImageChannelExtract_S16_21_D1(src_row, dst_row, xsize, cmask);
    /* strides are byte counts, so step through a byte pointer */
    src_row = (mlib_s16 *) ((mlib_u8 *) src_row + slb);
    dst_row = (mlib_s16 *) ((mlib_u8 *) dst_row + dlb);
  }
}
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd)
 *
 * Combines three 8-byte source words (twelve 16-bit elements of 3-channel
 * data) into one 8-byte result dd holding the "left" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 0, 3, 6 and 9 of the twelve -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper, so it must not be the body
 * of an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd) \
/* extract the left channel */ \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd)
 *
 * Combines three 8-byte source words (twelve 16-bit elements of 3-channel
 * data) into one 8-byte result dd holding the "middle" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 1, 4, 7 and 10 of the twelve -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper.
 */
#define CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd) \
/* extract the middle channel */ \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
sdc = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd)
 *
 * Combines three 8-byte source words (twelve 16-bit elements of 3-channel
 * data) into one 8-byte result dd holding the "right" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 2, 5, 8 and 11 of the twelve -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper.
 */
#define CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd) \
/* extract the right channel */ \
sda = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * Extract one channel from a 3-channel S16 image stored as one contiguous
 * run.  src and dst are accessed directly as 8-byte words, and dsize / 4
 * output words are produced, each from three source words.
 *
 * cmask 4 selects the left channel, 2 the middle channel, and any other
 * value the right channel.
 */
void mlib_v_ImageChannelExtract_S16_31_A8D1X4(const mlib_s16 *src,
                                              mlib_s16       *dst,
                                              mlib_s32       dsize,
                                              mlib_s32       cmask)
{
  mlib_d64 *in = (mlib_d64 *) src;
  mlib_d64 *out = (mlib_d64 *) dst;
  mlib_d64 w0, w1, w2;             /* current triple of source words */
  mlib_d64 sda, sdb, sdc;          /* scratch written by the macros */
  mlib_d64 res;
  mlib_s32 groups = dsize / 4;     /* number of 8-byte output words */
  mlib_s32 k;

  switch (cmask) {
    case 4:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        in += 3;
        CHANNELEXTRACT_S16_31L(w0, w1, w2, res);
        *out++ = res;
      }
      break;

    case 2:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        in += 3;
        CHANNELEXTRACT_S16_31M(w0, w1, w2, res);
        *out++ = res;
      }
      break;

    default:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        in += 3;
        CHANNELEXTRACT_S16_31R(w0, w1, w2, res);
        *out++ = res;
      }
      break;
  }
}
/***************************************************************/
/*
 * Two-dimensional form of the direct-word 3-channel -> 1-channel S16
 * extract.  For each of the ysize rows, xsize / 4 output words are built,
 * each from three source words; the row start pointers then advance by
 * slb / dlb bytes.
 *
 * cmask 4 selects the left channel, 2 the middle channel, and any other
 * value the right channel.
 */
void mlib_v_ImageChannelExtract_S16_31_A8D2X4(const mlib_s16 *src,
                                              mlib_s32       slb,
                                              mlib_s16       *dst,
                                              mlib_s32       dlb,
                                              mlib_s32       xsize,
                                              mlib_s32       ysize,
                                              mlib_s32       cmask)
{
  mlib_u8 *src_row = (mlib_u8 *) src;   /* byte pointers make the stride math direct */
  mlib_u8 *dst_row = (mlib_u8 *) dst;
  mlib_d64 *in, *out;
  mlib_d64 w0, w1, w2;
  mlib_d64 sda, sdb, sdc;               /* scratch written by the macros */
  mlib_d64 res;
  mlib_s32 groups = xsize / 4;          /* 8-byte output words per row */
  mlib_s32 row, k;

  switch (cmask) {
    case 4:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          in += 3;
          CHANNELEXTRACT_S16_31L(w0, w1, w2, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;

    case 2:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          in += 3;
          CHANNELEXTRACT_S16_31M(w0, w1, w2, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;

    default:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          in += 3;
          CHANNELEXTRACT_S16_31R(w0, w1, w2, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;
  }
}
/***************************************************************/
/*
 * Extract one channel from a single run of 3-channel S16 data into a
 * 1-channel S16 run, with no alignment requirement on src or dst.
 *
 *   src    first source element of the run
 *   dst    first destination element
 *   dsize  number of destination elements to produce
 *   cmask  channel selector: 4, 2, or any other value (see the offset
 *          computation and the branch conditions below)
 *
 * Both pointers are rounded down to 8-byte boundaries.  The shuffle macro
 * (L, M or R) is chosen from cmask and the source misalignment modulo the
 * 6-byte pixel size, and "off" is the byte shift fed to vis_alignaddr /
 * vis_faligndata to line the extracted data up with the destination.  The
 * first and last destination words are written with masked stores;
 * everything in between is written as whole 8-byte words.
 *
 * The source is always read in whole 8-byte words, three per output word,
 * starting at the rounded-down address, so bytes outside the run itself
 * are loaded.
 * NOTE(review): dsize <= 0 is not checked here; presumably callers
 * guarantee dsize > 0 -- confirm.
 */
void mlib_v_ImageChannelExtract_S16_31_D1(const mlib_s16 *src,
mlib_s16 *dst,
mlib_s32 dsize,
mlib_s32 cmask)
{
mlib_s16 *sa, *da;
mlib_s16 *dend, *dend2; /* dend: last dst element; dend2: dend - 3 elements */
mlib_d64 *dp; /* 8-byte aligned start points in dst */
mlib_d64 *sp; /* 8-byte aligned start point in src */
mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
mlib_d64 sd3, sd4, sd5;
mlib_d64 sda, sdb, sdc; /* scratch written by the CHANNELEXTRACT macros */
mlib_d64 dd0, dd1; /* previous / current extracted 8-byte group */
mlib_s32 soff; /* offset of address in src */
mlib_s32 doff; /* offset of address in dst */
mlib_s32 off; /* byte shift of the extracted src data relative to dst */
mlib_s32 emask; /* edge mask */
mlib_s32 i, n;
sa = (void *)src;
da = dst;
/* prepare the source address */
sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
soff = ((mlib_addr) sa & 7);
/* prepare the destination addresses */
dp = (mlib_d64 *) ((mlib_addr) da & (~7));
doff = ((mlib_addr) da & 7);
dend = da + dsize - 1;
/* dp <= dend2 is the test used below for "a whole 8-byte store still fits" */
dend2 = dend - 3;
/* calculate the byte shift between the extracted src data and dst */
if (cmask == 4) {
off = (soff / 6) * 2 - doff;
}
else if (cmask == 2) {
off = ((soff + 2) / 6) * 2 - doff;
}
else {
off = ((soff + 4) / 6) * 2 - doff;
}
/* pick the shuffle by where the wanted channel falls within the aligned
 * source words, which depends on both cmask and the source misalignment */
if (((cmask == 4) && (soff % 6 == 0)) ||
((cmask == 2) && (soff % 6 == 4)) ||
((cmask == 1) && (soff % 6 == 2))) { /* extract left channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 24 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
/* store 8 bytes result (masked by emask) */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
/* set the alignment offset used by the vis_faligndata calls below */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes; only one extracted group is available, so it
 * is passed as both operands.
 * NOTE(review): the leading bytes are presumably masked off by emask -- confirm */
CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_31L(sd0, sd1, sd2, dd0);
CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous group: each output word spans two extracted groups */
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_S16_31L(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else if (((cmask == 4) && (soff % 6 == 2)) ||
((cmask == 2) && (soff % 6 == 0)) ||
((cmask == 1) && (soff % 6 == 4))) {
/* extract middle channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 24 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
/* store 8 bytes result (masked by emask) */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
/* set the alignment offset used by the vis_faligndata calls below */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes; the single extracted group is passed as both
 * operands of vis_faligndata */
CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_31M(sd0, sd1, sd2, dd0);
CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous group: each output word spans two extracted groups */
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_S16_31M(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else { /* extract right channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 24 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
/* store 8 bytes result (masked by emask) */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
*dp++ = dd0;
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
/* set the alignment offset used by the vis_faligndata calls below */
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes; the single extracted group is passed as both
 * operands of vis_faligndata */
CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_31R(sd0, sd1, sd2, dd0);
CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* whole 8-byte (4-element) stores, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
/* keep the previous group: each output word spans two extracted groups */
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling: masked store of the remaining elements */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
CHANNELEXTRACT_S16_31R(sd3, sd4, sd5, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
}
/***************************************************************/
/*
 * Extract one channel from a 3-channel S16 image, one row at a time.
 * Each row is handed to mlib_v_ImageChannelExtract_S16_31_D1; the row
 * pointers are then advanced by slb / dlb, which are strides in bytes.
 */
void mlib_v_ImageChannelExtract_S16_31(const mlib_s16 *src,
                                       mlib_s32       slb,
                                       mlib_s16       *dst,
                                       mlib_s32       dlb,
                                       mlib_s32       xsize,
                                       mlib_s32       ysize,
                                       mlib_s32       cmask)
{
  mlib_s16 *src_row = (void *)src;
  mlib_s16 *dst_row = dst;
  mlib_s32 rows_left;

  for (rows_left = ysize; rows_left > 0; rows_left--) {
    mlib_v_ImageChannelExtract_S16_31_D1(src_row, dst_row, xsize, cmask);
    /* strides are byte counts, so step through a byte pointer */
    src_row = (mlib_s16 *) ((mlib_u8 *) src_row + slb);
    dst_row = (mlib_s16 *) ((mlib_u8 *) dst_row + dlb);
  }
}
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd)
 *
 * Combines four 8-byte source words (sixteen 16-bit elements of 4-channel
 * data) into one 8-byte result dd holding the "left" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 0, 4, 8 and 12 of the sixteen -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper, so it must not be the body
 * of an unbraced if/for.
 */
#define CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd) \
/* extract the left channel */ \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd)
 *
 * Combines four 8-byte source words (sixteen 16-bit elements of 4-channel
 * data) into one 8-byte result dd holding the "middle left" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 1, 5, 9 and 13 of the sixteen -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper.
 */
#define CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd) \
/* extract the middle left channel */ \
sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); \
sdb = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); \
sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd)
 *
 * Combines four 8-byte source words (sixteen 16-bit elements of 4-channel
 * data) into one 8-byte result dd holding the "middle right" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 2, 6, 10 and 14 of the sixteen -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper.
 */
#define CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd) \
/* extract the middle right channel */ \
sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd)
 *
 * Combines four 8-byte source words (sixteen 16-bit elements of 4-channel
 * data) into one 8-byte result dd holding the "right" channel.
 * NOTE(review): with the standard VIS byte-interleaving vis_fpmerge this
 * selects elements 3, 7, 11 and 15 of the sixteen -- confirm.
 *
 * Writes the caller's mlib_d64 locals sda, sdb and sdc.  Expands to several
 * statements without a do { } while (0) wrapper.
 */
#define CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd) \
/* extract the right channel */ \
sda = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); \
sdb = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); \
sdc = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sdb)); \
dd = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc))
/***************************************************************/
/*
 * Extract one channel from a 4-channel S16 image stored as one contiguous
 * run.  src and dst are accessed directly as 8-byte words, and dsize / 4
 * output words are produced, each from four source words.
 *
 * cmask 8 selects the left channel, 4 the middle-left, 2 the middle-right,
 * and any other value the right channel.
 */
void mlib_v_ImageChannelExtract_S16_41_A8D1X4(const mlib_s16 *src,
                                              mlib_s16       *dst,
                                              mlib_s32       dsize,
                                              mlib_s32       cmask)
{
  mlib_d64 *in = (mlib_d64 *) src;
  mlib_d64 *out = (mlib_d64 *) dst;
  mlib_d64 w0, w1, w2, w3;         /* current quadruple of source words */
  mlib_d64 sda, sdb, sdc;          /* scratch written by the macros */
  mlib_d64 res;
  mlib_s32 groups = dsize / 4;     /* number of 8-byte output words */
  mlib_s32 k;

  switch (cmask) {
    case 8:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        w3 = in[3];
        in += 4;
        CHANNELEXTRACT_S16_41L(w0, w1, w2, w3, res);
        *out++ = res;
      }
      break;

    case 4:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        w3 = in[3];
        in += 4;
        CHANNELEXTRACT_S16_41ML(w0, w1, w2, w3, res);
        *out++ = res;
      }
      break;

    case 2:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        w3 = in[3];
        in += 4;
        CHANNELEXTRACT_S16_41MR(w0, w1, w2, w3, res);
        *out++ = res;
      }
      break;

    default:
#pragma pipeloop(0)
      for (k = 0; k < groups; k++) {
        w0 = in[0];
        w1 = in[1];
        w2 = in[2];
        w3 = in[3];
        in += 4;
        CHANNELEXTRACT_S16_41R(w0, w1, w2, w3, res);
        *out++ = res;
      }
      break;
  }
}
/***************************************************************/
/*
 * Two-dimensional form of the direct-word 4-channel -> 1-channel S16
 * extract.  For each of the ysize rows, xsize / 4 output words are built,
 * each from four source words; the row start pointers then advance by
 * slb / dlb bytes.
 *
 * cmask 8 selects the left channel, 4 the middle-left, 2 the middle-right,
 * and any other value the right channel.
 */
void mlib_v_ImageChannelExtract_S16_41_A8D2X4(const mlib_s16 *src,
                                              mlib_s32       slb,
                                              mlib_s16       *dst,
                                              mlib_s32       dlb,
                                              mlib_s32       xsize,
                                              mlib_s32       ysize,
                                              mlib_s32       cmask)
{
  mlib_u8 *src_row = (mlib_u8 *) src;   /* byte pointers make the stride math direct */
  mlib_u8 *dst_row = (mlib_u8 *) dst;
  mlib_d64 *in, *out;
  mlib_d64 w0, w1, w2, w3;
  mlib_d64 sda, sdb, sdc;               /* scratch written by the macros */
  mlib_d64 res;
  mlib_s32 groups = xsize / 4;          /* 8-byte output words per row */
  mlib_s32 row, k;

  switch (cmask) {
    case 8:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          w3 = in[3];
          in += 4;
          CHANNELEXTRACT_S16_41L(w0, w1, w2, w3, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;

    case 4:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          w3 = in[3];
          in += 4;
          CHANNELEXTRACT_S16_41ML(w0, w1, w2, w3, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;

    case 2:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          w3 = in[3];
          in += 4;
          CHANNELEXTRACT_S16_41MR(w0, w1, w2, w3, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;

    default:
      for (row = 0; row < ysize; row++) {
        in = (mlib_d64 *) src_row;
        out = (mlib_d64 *) dst_row;
#pragma pipeloop(0)
        for (k = 0; k < groups; k++) {
          w0 = in[0];
          w1 = in[1];
          w2 = in[2];
          w3 = in[3];
          in += 4;
          CHANNELEXTRACT_S16_41R(w0, w1, w2, w3, res);
          *out++ = res;
        }
        src_row += slb;
        dst_row += dlb;
      }
      break;
  }
}
/***************************************************************/
void mlib_v_ImageChannelExtract_S16_41_D1(const mlib_s16 *src,
mlib_s16 *dst,
mlib_s32 dsize,
mlib_s32 cmask)
{
mlib_s16 *sa, *da;
mlib_s16 *dend, *dend2; /* end points in dst */
mlib_d64 *dp; /* 8-byte aligned start points in dst */
mlib_d64 *sp; /* 8-byte aligned start point in src */
mlib_d64 sd0, sd1, sd2, sd3; /* 8-byte source data */
mlib_d64 sd4, sd5, sd6, sd7;
mlib_d64 sda, sdb, sdc;
mlib_d64 dd0, dd1;
mlib_s32 soff; /* offset of address in src */
mlib_s32 doff; /* offset of address in dst */
mlib_s32 off; /* offset of src over dst */
mlib_s32 emask; /* edge mask */
mlib_s32 i, n;
sa = (void *)src;
da = dst;
/* prepare the source address */
sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
soff = ((mlib_addr) sa & 7);
/* prepare the destination addresses */
dp = (mlib_d64 *) ((mlib_addr) da & (~7));
doff = ((mlib_addr) da & 7);
dend = da + dsize - 1;
dend2 = dend - 3;
/* calculate the src's offset over dst */
if (cmask == 8) {
off = (soff / 8) * 2 - doff;
}
else if (cmask == 4) {
off = ((soff + 2) / 8) * 2 - doff;
}
else if (cmask == 2) {
off = ((soff + 4) / 8) * 2 - doff;
}
else {
off = ((soff + 6) / 8) * 2 - doff;
}
if (((cmask == 8) && (soff == 0)) ||
((cmask == 4) && (soff == 6)) ||
((cmask == 2) && (soff == 4)) ||
((cmask == 1) && (soff == 2))) { /* extract left channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41L(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41L(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else if (((cmask == 8) && (soff == 2)) ||
((cmask == 4) && (soff == 0)) ||
((cmask == 2) && (soff == 6)) ||
((cmask == 1) && (soff == 4))) { /* extract middle left channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41ML(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41ML(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else if (((cmask == 8) && (soff == 4)) ||
((cmask == 4) && (soff == 2)) ||
((cmask == 2) && (soff == 0)) ||
((cmask == 1) && (soff == 6))) { /* extract middle right channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
else {
/* load 48 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41MR(sd0, sd1, sd2, sd3, dd0);
CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41MR(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
else { /* extract right channel */
if (off == 0) { /* src and dst have same alignment */
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
/* load 16 bytes */
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
/* extract, including some garbage at the start point */
CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
/* store 8 bytes result */
vis_pst_16(dd0, dp++, emask);
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
*dp++ = dd0;
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
sd0 = *sp++;
sd1 = *sp++;
sd2 = *sp++;
sd3 = *sp++;
CHANNELEXTRACT_S16_41R(sd0, sd1, sd2, sd3, dd0);
vis_pst_16(dd0, dp++, emask);
}
}
else {
vis_alignaddr((void *)0, off);
/* generate edge mask for the start point */
emask = vis_edge16(da, dend);
if (off < 0) {
/* load 24 bytes */
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
/* extract and store 8 bytes */
CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd1, dd1), dp++, emask);
}
if ((mlib_addr) dp <= (mlib_addr) dend2) {
n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 8 + 1;
/* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
for (i = 0; i < n; i++) {
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
*dp++ = vis_faligndata(dd0, dd1);
}
}
/* end point handling */
if ((mlib_addr) dp <= (mlib_addr) dend) {
emask = vis_edge16(dp, dend);
dd0 = dd1;
sd4 = *sp++;
sd5 = *sp++;
sd6 = *sp++;
sd7 = *sp++;
CHANNELEXTRACT_S16_41R(sd4, sd5, sd6, sd7, dd1);
vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask);
}
}
}
}
/***************************************************************/
/*
 * Row driver for the S16 "41" channel extraction (by its name, 4 source
 * channels to 1 destination channel -- the per-row work itself is done
 * entirely by mlib_v_ImageChannelExtract_S16_41_D1).
 *
 *   src    first source row
 *   slb    distance between consecutive source rows, in bytes
 *   dst    first destination row
 *   dlb    distance between consecutive destination rows, in bytes
 *   xsize  forwarded unchanged to the per-row routine as its size argument
 *   ysize  number of rows to process; nothing is done when ysize <= 0
 *   cmask  forwarded unchanged to the per-row routine as its channel mask
 */
void mlib_v_ImageChannelExtract_S16_41(const mlib_s16 *src,
                                       mlib_s32 slb,
                                       mlib_s16 *dst,
                                       mlib_s32 dlb,
                                       mlib_s32 xsize,
                                       mlib_s32 ysize,
                                       mlib_s32 cmask)
{
  /* Row cursors are byte pointers because slb/dlb are applied as byte offsets. */
  mlib_u8 *src_row = (void *)src;
  mlib_u8 *dst_row = (mlib_u8 *) dst;
  mlib_s32 row;

  for (row = 0; row < ysize; row++) {
    mlib_v_ImageChannelExtract_S16_41_D1((mlib_s16 *) src_row,
                                         (mlib_s16 *) dst_row,
                                         xsize,
                                         cmask);

    /* step both cursors to the start of the next row */
    src_row += slb;
    dst_row += dlb;
  }
}
/***************************************************************/