0N/A/*
2362N/A * Copyright (c) 2000, 2003, Oracle and/or its affiliates. All rights reserved.
0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0N/A *
0N/A * This code is free software; you can redistribute it and/or modify it
0N/A * under the terms of the GNU General Public License version 2 only, as
2362N/A * published by the Free Software Foundation. Oracle designates this
0N/A * particular file as subject to the "Classpath" exception as provided
2362N/A * by Oracle in the LICENSE file that accompanied this code.
0N/A *
0N/A * This code is distributed in the hope that it will be useful, but WITHOUT
0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0N/A * version 2 for more details (a copy is included in the LICENSE file that
0N/A * accompanied this code).
0N/A *
0N/A * You should have received a copy of the GNU General Public License version
0N/A * 2 along with this work; if not, write to the Free Software Foundation,
0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0N/A *
2362N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2362N/A * or visit www.oracle.com if you need additional information or have any
2362N/A * questions.
0N/A */
0N/A
0N/A
0N/A
0N/A/*
0N/A * FUNCTION
0N/A * Internal functions for mlib_ImageConv* on U8 type
0N/A * and MLIB_EDGE_DST_NO_WRITE mask
0N/A *
0N/A */
0N/A
0N/A/***************************************************************/
0N/A
0N/A#include <vis_proto.h>
0N/A#include <mlib_image.h>
0N/A#include <mlib_ImageCheck.h>
0N/A#include <mlib_ImageColormap.h>
0N/A
/*
  This defines switches between functions in
  files: mlib_v_ImageConv_8nw.c,
         mlib_v_ImageConvIndex3_8_16nw.c,
         mlib_v_ImageConvIndex4_8_16nw.c
*/
0N/A
/*
 * Build switches for this translation unit.
 *
 * CONV_INDEX - when defined, the "#ifdef CONV_INDEX" branches below are
 *              compiled, i.e. the color-indexed variants of the routines.
 * DTYPE      - element type used for the source and destination image
 *              rows (sl, sp, dl, adr_src, adr_dst in DEF_VARS).
 * LTYPE      - element type of the colormap lookup tables
 *              (lut_table, ltbl0..ltbl3 in DEF_EXTRA_VARS).
 */
#define CONV_INDEX

#define DTYPE mlib_s16
#define LTYPE mlib_u8
0N/A
0N/A/***************************************************************/
0N/A
/*
 * CONV_FUNC(KERN) expands to the name and parameter list of a fixed-size
 * convolution routine; KERN is pasted into the name (e.g. 2x2, 3x3).
 * The return type is written by the user of the macro.
 *
 * With CONV_INDEX the routine is mlib_conv<KERN>_Index3_8_16nw and takes
 * an additional colormap argument; otherwise it is mlib_conv<KERN>_8nw_f.
 */
#ifdef CONV_INDEX

#define CONV_FUNC(KERN)                                 \
  mlib_conv##KERN##_Index3_8_16nw(mlib_image *dst,      \
                                  mlib_image *src,      \
                                  mlib_s32   *kern,     \
                                  mlib_s32   scale,     \
                                  void       *colormap)

#else

#define CONV_FUNC(KERN)                         \
  mlib_conv##KERN##_8nw_f(mlib_image *dst,      \
                          mlib_image *src,      \
                          mlib_s32   *kern,     \
                          mlib_s32   scale)

#endif
0N/A
0N/A/***************************************************************/
0N/A
/*
 * NCHAN is the number of interleaved U8 channels per pixel in the line
 * buffers.  In the indexed build it is the compile-time constant 3; in
 * the non-indexed build it refers to the local variable nchan declared
 * by DEF_EXTRA_VARS.
 */
#ifdef CONV_INDEX

#define NCHAN  3

#else

#define NCHAN  nchan

#endif
0N/A
0N/A/***************************************************************/
0N/A
/*
 * DEF_VARS declares the locals shared by every convolution routine in
 * this file.  It expects src, dst, kern and scale to be in scope.
 *
 *   sl, sp, dl       - current source line, source read pointer and
 *                      destination line (DTYPE elements)
 *   hgt, wid         - source image height and width
 *   sll, dll         - source / destination line strides in DTYPE elements
 *   adr_src, adr_dst - image data pointers
 *   ssize .. emask   - size bookkeeping filled in by the routine;
 *                      buff_ind starts at 0
 *   pbuff, dp        - scratch allocation and output pointer
 *   karr             - the kernel array viewed as mlib_f32 values
 *   gsr_scale        - (31 - scale) shifted left by 3; the routines add
 *                      a byte offset in the low three bits before
 *                      passing the sum to vis_write_gsr()
 *   drnd             - rounding constant mlib_round_8[31 - scale],
 *                      duplicated into both halves of a mlib_d64
 *
 * NOTE: mlib_round_8 has 16 entries, so scale must lie in [16, 31];
 * this is not checked here.
 */
#define DEF_VARS                                                \
  DTYPE    *sl, *sp, *dl;                                       \
  mlib_s32 hgt = mlib_ImageGetHeight(src);                      \
  mlib_s32 wid = mlib_ImageGetWidth(src);                       \
  mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(DTYPE);      \
  mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(DTYPE);      \
  DTYPE    *adr_src = (DTYPE *)mlib_ImageGetData(src);          \
  DTYPE    *adr_dst = (DTYPE *)mlib_ImageGetData(dst);          \
  mlib_s32 ssize, xsize, dsize, esize, emask, buff_ind = 0;     \
  mlib_d64 *pbuff, *dp;                                         \
  mlib_f32 *karr = (mlib_f32 *)kern;                            \
  mlib_s32 gsr_scale = (31 - scale) << 3;                       \
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);  \
  mlib_s32 i, j, l
0N/A
0N/A/***************************************************************/
0N/A
/*
 * DEF_EXTRA_VARS declares the locals that differ between the two builds.
 *
 * Indexed build (expects colormap in scope):
 *   offset      - LUT offset reported by mlib_ImageGetLutOffset()
 *   lut_table   - per-channel lookup tables from mlib_ImageGetLutData()
 *   ltbl0..2    - channel tables pre-biased by -offset, so they can be
 *                 indexed directly with a source value
 *   ltbl3       - fourth channel table when NCHAN > 3, otherwise an
 *                 alias of ltbl2 so the name is always defined
 *
 * Non-indexed build (expects dst in scope):
 *   nchan       - channel count of the destination image
 */
#ifdef CONV_INDEX

#define DEF_EXTRA_VARS                                                  \
  int    offset = mlib_ImageGetLutOffset(colormap);                     \
  LTYPE  **lut_table = (LTYPE**)mlib_ImageGetLutData(colormap);         \
  LTYPE  *ltbl0 = lut_table[0] - offset;                                \
  LTYPE  *ltbl1 = lut_table[1] - offset;                                \
  LTYPE  *ltbl2 = lut_table[2] - offset;                                \
  LTYPE  *ltbl3 = (NCHAN > 3) ? lut_table[3] - offset : ltbl2

#else

#define DEF_EXTRA_VARS                          \
  mlib_s32 nchan = mlib_ImageGetChannels(dst)

#endif
0N/A
0N/A/***************************************************************/
0N/A
/*
 * LOAD_SRC() expands a group of source index values through the colormap
 * tables into 24 channel-interleaved bytes and stores them as three
 * mlib_d64 words at buffn[i], buffn[i + 1], buffn[i + 2].
 *
 * It uses sp, buffn, i and ltbl0..ltbl3 from the enclosing scope and
 * advances sp past the consumed indices (8 for 3 channels, 6 for the
 * 4-table variant).
 *
 * Each vis_faligndata(byte, t) call is used to push one more looked-up
 * byte in front of the bytes already collected in t, which is why the
 * bytes are fed in reverse order (last channel of the last pixel first).
 * NOTE(review): this relies on the caller having set the GSR align
 * offset to 7 (the routines call vis_write_gsr(gsr_scale + 7) before
 * using the macro) - confirm against the VIS manual.
 *
 * t0..t2 are zero-initialized so the first vis_faligndata() of each word
 * does not read an indeterminate value; every initial byte is shifted
 * out after the eight insertions made into each word, so the stored
 * result does not depend on the initial value.
 */
#if NCHAN == 3

#define LOAD_SRC() {                                            \
    mlib_s32 s0 = sp[0], s1 = sp[1], s2 = sp[2], s3 = sp[3];    \
    mlib_s32 s4 = sp[4], s5 = sp[5], s6 = sp[6], s7 = sp[7];    \
    mlib_d64 t0 = 0, t1 = 0, t2 = 0;                            \
                                                                \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s7), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s7), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s7), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s6), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s6), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s6), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s5), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s5), t2);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s5), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s4), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s4), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s4), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s2), t1);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s2), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s2), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s0), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s0), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s0), t0);            \
                                                                \
    buffn[i    ] = t0;                                          \
    buffn[i + 1] = t1;                                          \
    buffn[i + 2] = t2;                                          \
                                                                \
    sp += 8;                                                    \
  }

#else

#define LOAD_SRC() {                                            \
    mlib_s32 s0 = sp[0], s1 = sp[1], s2 = sp[2], s3 = sp[3];    \
    mlib_s32 s4 = sp[4], s5 = sp[5];                            \
    mlib_d64 t0 = 0, t1 = 0, t2 = 0;                            \
                                                                \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl3, s5), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s5), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s5), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s5), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl3, s4), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s4), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s4), t2);            \
    t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s4), t2);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl3, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s3), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl3, s2), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s2), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s2), t1);            \
    t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s2), t1);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl3, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s1), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl3, s0), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s0), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s0), t0);            \
    t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s0), t0);            \
                                                                \
    buffn[i    ] = t0;                                          \
    buffn[i + 1] = t1;                                          \
    buffn[i + 2] = t2;                                          \
                                                                \
    sp += 6;                                                    \
  }

#endif
0N/A
0N/A/***************************************************************/
0N/A
0N/Astatic mlib_s32 mlib_round_8[16] = { 0x00400040, 0x00200020, 0x00100010, 0x00080008,
0N/A 0x00040004, 0x00020002, 0x00010001, 0x00000000,
0N/A 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0N/A 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
0N/A
0N/A/***************************************************************/
0N/A
0N/Avoid mlib_ImageCopy_na(mlib_u8 *sa, mlib_u8 *da, int size);
0N/A
0N/A/***************************************************************/
0N/A
0N/A#define KSIZE 2
0N/A
0N/Amlib_status CONV_FUNC(2x2)
0N/A{
0N/A mlib_d64 *buffs[2*(KSIZE + 1)];
0N/A mlib_d64 *buff0, *buff1, *buffn, *buffd, *buffe;
0N/A mlib_d64 s00, s01, s10, s11, s0, s1;
0N/A mlib_d64 d0, d1, d00, d01, d10, d11;
0N/A DEF_VARS;
0N/A DEF_EXTRA_VARS;
0N/A
0N/A sl = adr_src;
0N/A dl = adr_dst;
0N/A
0N/A ssize = NCHAN*wid;
0N/A dsize = (ssize + 7)/8;
0N/A esize = dsize + 4;
0N/A pbuff = mlib_malloc((KSIZE + 4)*esize*sizeof(mlib_d64));
0N/A if (pbuff == NULL) return MLIB_FAILURE;
0N/A
0N/A for (i = 0; i < (KSIZE + 1); i++) buffs[i] = pbuff + i*esize;
0N/A for (i = 0; i < (KSIZE + 1); i++) buffs[(KSIZE + 1) + i] = buffs[i];
0N/A buffd = buffs[KSIZE] + esize;
0N/A buffe = buffd + 2*esize;
0N/A
0N/A wid -= (KSIZE - 1);
0N/A hgt -= (KSIZE - 1);
0N/A xsize = ssize - NCHAN*(KSIZE - 1);
0N/A emask = (0xFF00 >> (xsize & 7)) & 0xFF;
0N/A
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A for (l = 0; l < KSIZE; l++) {
0N/A mlib_d64 *buffn = buffs[l];
0N/A sp = sl + l*sll;
0N/A
0N/A#ifndef CONV_INDEX
0N/A if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
0N/A
0N/A#else
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A LOAD_SRC();
0N/A }
0N/A#endif /* CONV_INDEX */
0N/A }
0N/A
0N/A for (j = 0; j < hgt; j++) {
0N/A mlib_d64 **buffc = buffs + buff_ind;
0N/A mlib_f32 *pk = karr, k0, k1;
0N/A sp = sl + KSIZE*sll;
0N/A
0N/A buff0 = buffc[0];
0N/A buff1 = buffc[1];
0N/A buffn = buffc[KSIZE];
0N/A
0N/A#ifndef CONV_INDEX
0N/A if ((((mlib_addr)(sl )) & 7) == 0) buff0 = (mlib_d64*)sl;
0N/A if ((((mlib_addr)(sl + sll)) & 7) == 0) buff1 = (mlib_d64*)(sl + sll);
0N/A if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
0N/A#endif
0N/A
0N/A k0 = pk[1];
0N/A k1 = pk[3];
0N/A vis_write_gsr(gsr_scale + NCHAN);
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A
0N/A d0 = vis_fpadd16(d00, d10);
0N/A d1 = vis_fpadd16(d01, d11);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A k0 = pk[0];
0N/A k1 = pk[2];
0N/A#ifndef CONV_INDEX
0N/A dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
0N/A
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < xsize/8; i++) {
0N/A s0 = buff0[i];
0N/A s1 = buff1[i];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d0 = vis_fpadd16(d0, drnd);
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d1 = vis_fpadd16(d1, drnd);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A dp[i] = vis_fpack16_pair(d0, d1);
0N/A }
0N/A
0N/A if (emask) {
0N/A s0 = buff0[i];
0N/A s1 = buff1[i];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d0 = vis_fpadd16(d0, drnd);
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d1 = vis_fpadd16(d1, drnd);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A
0N/A d0 = vis_fpack16_pair(d0, d1);
0N/A vis_pst_8(d0, dp + i, emask);
0N/A }
0N/A
0N/A if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
0N/A
0N/A#else
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A mlib_d64 d00, d01, d02, d03, d04, d05;
0N/A mlib_d64 d10, d11, d12, d13, d14, d15;
0N/A mlib_d64 d0, d1, d2, d3, d4, d5;
0N/A mlib_d64 s00 = buff0[i];
0N/A mlib_d64 s01 = buff0[i + 1];
0N/A mlib_d64 s02 = buff0[i + 2];
0N/A mlib_d64 s10 = buff1[i];
0N/A mlib_d64 s11 = buff1[i + 1];
0N/A mlib_d64 s12 = buff1[i + 2];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
0N/A d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
0N/A d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
0N/A d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
0N/A d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d2 = buffd[2*i + 2];
0N/A d3 = buffd[2*i + 3];
0N/A d4 = buffd[2*i + 4];
0N/A d5 = buffd[2*i + 5];
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d0 = vis_fpadd16(d0, drnd);
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d1 = vis_fpadd16(d1, drnd);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d02 = vis_fpadd16(d02, d12);
0N/A d2 = vis_fpadd16(d2, drnd);
0N/A d2 = vis_fpadd16(d2, d02);
0N/A d03 = vis_fpadd16(d03, d13);
0N/A d3 = vis_fpadd16(d3, drnd);
0N/A d3 = vis_fpadd16(d3, d03);
0N/A d04 = vis_fpadd16(d04, d14);
0N/A d4 = vis_fpadd16(d4, drnd);
0N/A d4 = vis_fpadd16(d4, d04);
0N/A d05 = vis_fpadd16(d05, d15);
0N/A d5 = vis_fpadd16(d5, drnd);
0N/A d5 = vis_fpadd16(d5, d05);
0N/A
0N/A buffe[i ] = vis_fpack16_pair(d0, d1);
0N/A buffe[i + 1] = vis_fpack16_pair(d2, d3);
0N/A buffe[i + 2] = vis_fpack16_pair(d4, d5);
0N/A
0N/A LOAD_SRC();
0N/A }
0N/A
0N/A mlib_ImageColorTrue2IndexLine_U8_S16_3((void*)buffe, dl, wid, colormap);
0N/A#endif /* CONV_INDEX */
0N/A
0N/A sl += sll;
0N/A dl += dll;
0N/A
0N/A buff_ind++;
0N/A if (buff_ind >= (KSIZE + 1)) buff_ind = 0;
0N/A }
0N/A
0N/A mlib_free(pbuff);
0N/A
0N/A return MLIB_SUCCESS;
0N/A}
0N/A
0N/A/***************************************************************/
0N/A
0N/A#undef KSIZE
0N/A#define KSIZE 3
0N/A
0N/Amlib_status CONV_FUNC(3x3)
0N/A{
0N/A mlib_d64 *buffs[2*(KSIZE + 1)];
0N/A mlib_d64 *buff0, *buff1, *buff2, *buffn, *buffd, *buffe;
0N/A mlib_d64 s00, s01, s10, s11, s20, s21, s0, s1, s2;
0N/A mlib_d64 dd, d0, d1, d00, d01, d10, d11, d20, d21;
0N/A mlib_s32 ik, ik_last, off, doff;
0N/A DEF_VARS;
0N/A DEF_EXTRA_VARS;
0N/A
0N/A sl = adr_src;
0N/A#ifdef CONV_INDEX
0N/A dl = adr_dst + ((KSIZE - 1)/2)*(dll + 1);
0N/A#else
0N/A dl = adr_dst + ((KSIZE - 1)/2)*(dll + NCHAN);
0N/A#endif
0N/A
0N/A ssize = NCHAN*wid;
0N/A dsize = (ssize + 7)/8;
0N/A esize = dsize + 4;
0N/A pbuff = mlib_malloc((KSIZE + 4)*esize*sizeof(mlib_d64));
0N/A if (pbuff == NULL) return MLIB_FAILURE;
0N/A
0N/A for (i = 0; i < (KSIZE + 1); i++) buffs[i] = pbuff + i*esize;
0N/A for (i = 0; i < (KSIZE + 1); i++) buffs[(KSIZE + 1) + i] = buffs[i];
0N/A buffd = buffs[KSIZE] + esize;
0N/A buffe = buffd + 2*esize;
0N/A
0N/A wid -= (KSIZE - 1);
0N/A hgt -= (KSIZE - 1);
0N/A xsize = ssize - NCHAN*(KSIZE - 1);
0N/A emask = (0xFF00 >> (xsize & 7)) & 0xFF;
0N/A
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A for (l = 0; l < KSIZE; l++) {
0N/A mlib_d64 *buffn = buffs[l];
0N/A sp = sl + l*sll;
0N/A
0N/A#ifndef CONV_INDEX
0N/A if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
0N/A#else
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A LOAD_SRC();
0N/A }
0N/A#endif /* CONV_INDEX */
0N/A }
0N/A
0N/A /* init buffer */
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A for (j = 0; j < hgt; j++) {
0N/A mlib_d64 **buffc = buffs + buff_ind, *pbuff0, *pbuff1, *pbuff2;
0N/A mlib_f32 *pk = karr, k0, k1, k2;
0N/A sp = sl + KSIZE*sll;
0N/A
0N/A pbuff0 = buffc[0];
0N/A pbuff1 = buffc[1];
0N/A pbuff2 = buffc[2];
0N/A buffn = buffc[KSIZE];
0N/A
0N/A#ifndef CONV_INDEX
0N/A if ((((mlib_addr)(sl )) & 7) == 0) pbuff0 = (mlib_d64*)sl;
0N/A if ((((mlib_addr)(sl + sll)) & 7) == 0) pbuff1 = (mlib_d64*)(sl + sll);
0N/A if ((((mlib_addr)(sl + 2*sll)) & 7) == 0) pbuff2 = (mlib_d64*)(sl + 2*sll);
0N/A
0N/A if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
0N/A#endif
0N/A
0N/A#ifdef CONV_INDEX
0N/A ik_last = 0;
0N/A#else
0N/A ik_last = (KSIZE - 1);
0N/A#endif
0N/A
0N/A for (ik = 0; ik < KSIZE; ik++) {
0N/A k0 = pk[ik];
0N/A k1 = pk[ik + KSIZE];
0N/A k2 = pk[ik + 2*KSIZE];
0N/A
0N/A off = ik*NCHAN;
0N/A doff = off/8;
0N/A off &= 7;
0N/A buff0 = pbuff0 + doff;
0N/A buff1 = pbuff1 + doff;
0N/A buff2 = pbuff2 + doff;
0N/A vis_write_gsr(gsr_scale + off);
0N/A
0N/A if (ik == ik_last) continue;
0N/A /*if (!ik_last) {
0N/A if ((off & 3) || (ik == (KSIZE - 1))) {
0N/A ik_last = ik;
0N/A continue;
0N/A }
0N/A }*/
0N/A
0N/A if (off == 0) {
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A s0 = buff0[i];
0N/A s1 = buff1[i];
0N/A s2 = buff2[i];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d0 = vis_fpadd16(d10, d0);
0N/A d0 = vis_fpadd16(d20, d0);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A d1 = vis_fpadd16(d11, d1);
0N/A d1 = vis_fpadd16(d21, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A } else if (off == 4) {
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
0N/A d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d0 = vis_fpadd16(d10, d0);
0N/A d0 = vis_fpadd16(d20, d0);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A d1 = vis_fpadd16(d11, d1);
0N/A d1 = vis_fpadd16(d21, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A } else {
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d0 = vis_fpadd16(d10, d0);
0N/A d0 = vis_fpadd16(d20, d0);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A d1 = vis_fpadd16(d11, d1);
0N/A d1 = vis_fpadd16(d21, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A }
0N/A }
0N/A
0N/A k0 = pk[ik_last];
0N/A k1 = pk[ik_last + KSIZE];
0N/A k2 = pk[ik_last + 2*KSIZE];
0N/A
0N/A off = ik_last*NCHAN;
0N/A doff = off/8;
0N/A off &= 7;
0N/A buff0 = pbuff0 + doff;
0N/A buff1 = pbuff1 + doff;
0N/A buff2 = pbuff2 + doff;
0N/A vis_write_gsr(gsr_scale + off);
0N/A
0N/A#ifndef CONV_INDEX
0N/A dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < xsize/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A dp[i] = dd;
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if (emask) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A vis_pst_8(dd, dp + i, emask);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
0N/A
0N/A#else
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A mlib_d64 d00, d01, d02, d03, d04, d05;
0N/A mlib_d64 d10, d11, d12, d13, d14, d15;
0N/A mlib_d64 d20, d21, d22, d23, d24, d25;
0N/A mlib_d64 d0, d1, d2, d3, d4, d5;
0N/A mlib_d64 s00 = buff0[i];
0N/A mlib_d64 s01 = buff0[i + 1];
0N/A mlib_d64 s02 = buff0[i + 2];
0N/A mlib_d64 s10 = buff1[i];
0N/A mlib_d64 s11 = buff1[i + 1];
0N/A mlib_d64 s12 = buff1[i + 2];
0N/A mlib_d64 s20 = buff2[i];
0N/A mlib_d64 s21 = buff2[i + 1];
0N/A mlib_d64 s22 = buff2[i + 2];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
0N/A d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
0N/A d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
0N/A d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
0N/A d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s20), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s20), k2);
0N/A d22 = vis_fmul8x16au(vis_read_hi(s21), k2);
0N/A d23 = vis_fmul8x16au(vis_read_lo(s21), k2);
0N/A d24 = vis_fmul8x16au(vis_read_hi(s22), k2);
0N/A d25 = vis_fmul8x16au(vis_read_lo(s22), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d2 = buffd[2*i + 2];
0N/A d3 = buffd[2*i + 3];
0N/A d4 = buffd[2*i + 4];
0N/A d5 = buffd[2*i + 5];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A d2 = vis_fpadd16(d2, d02);
0N/A d2 = vis_fpadd16(d2, d12);
0N/A d2 = vis_fpadd16(d2, d22);
0N/A d3 = vis_fpadd16(d3, d03);
0N/A d3 = vis_fpadd16(d3, d13);
0N/A d3 = vis_fpadd16(d3, d23);
0N/A d4 = vis_fpadd16(d4, d04);
0N/A d4 = vis_fpadd16(d4, d14);
0N/A d4 = vis_fpadd16(d4, d24);
0N/A d5 = vis_fpadd16(d5, d05);
0N/A d5 = vis_fpadd16(d5, d15);
0N/A d5 = vis_fpadd16(d5, d25);
0N/A
0N/A buffe[i ] = vis_fpack16_pair(d0, d1);
0N/A buffe[i + 1] = vis_fpack16_pair(d2, d3);
0N/A buffe[i + 2] = vis_fpack16_pair(d4, d5);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A buffd[2*i + 2] = drnd;
0N/A buffd[2*i + 3] = drnd;
0N/A buffd[2*i + 4] = drnd;
0N/A buffd[2*i + 5] = drnd;
0N/A
0N/A LOAD_SRC();
0N/A }
0N/A
0N/A mlib_ImageColorTrue2IndexLine_U8_S16_3((void*)buffe, dl, wid, colormap);
0N/A#endif /* CONV_INDEX */
0N/A
0N/A sl += sll;
0N/A dl += dll;
0N/A
0N/A buff_ind++;
0N/A if (buff_ind >= (KSIZE + 1)) buff_ind = 0;
0N/A }
0N/A
0N/A mlib_free(pbuff);
0N/A
0N/A return MLIB_SUCCESS;
0N/A}
0N/A
0N/A/***************************************************************/
0N/A
0N/A#undef KSIZE
0N/A#define MAX_N 11
0N/A
0N/A#ifdef CONV_INDEX
0N/A
0N/Amlib_status mlib_convMxN_Index3_8_16nw(mlib_image *dst,
0N/A mlib_image *src,
0N/A mlib_s32 m,
0N/A mlib_s32 n,
0N/A mlib_s32 dm,
0N/A mlib_s32 dn,
0N/A mlib_s32 *kern,
0N/A mlib_s32 scale,
0N/A void *colormap)
0N/A
0N/A#else
0N/A
0N/Amlib_status mlib_convMxN_8nw_f(mlib_image *dst,
0N/A mlib_image *src,
0N/A mlib_s32 m,
0N/A mlib_s32 n,
0N/A mlib_s32 dm,
0N/A mlib_s32 dn,
0N/A mlib_s32 *kern,
0N/A mlib_s32 scale)
0N/A
0N/A#endif
0N/A{
0N/A mlib_d64 *buffs_local[3*(MAX_N + 1)], **buffs = buffs_local, **buff;
0N/A mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
0N/A mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
0N/A mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
0N/A mlib_d64 dd, d0, d1;
0N/A mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
0N/A DEF_VARS;
0N/A DEF_EXTRA_VARS;
0N/A
0N/A if (n > MAX_N) {
0N/A buffs = mlib_malloc(3*(n + 1)*sizeof(mlib_d64*));
0N/A if (buffs == NULL) return MLIB_FAILURE;
0N/A }
0N/A
0N/A buff = buffs + 2*(n + 1);
0N/A
0N/A sl = adr_src;
0N/A#ifdef CONV_INDEX
0N/A dl = adr_dst + dn*dll + dm;
0N/A#else
0N/A dl = adr_dst + dn*dll + dm*NCHAN;
0N/A#endif
0N/A
0N/A ssize = NCHAN*wid;
0N/A dsize = (ssize + 7)/8;
0N/A esize = dsize + 4;
0N/A pbuff = mlib_malloc((n + 4)*esize*sizeof(mlib_d64));
0N/A if (pbuff == NULL) {
0N/A if (buffs != buffs_local) mlib_free(buffs);
0N/A return MLIB_FAILURE;
0N/A }
0N/A
0N/A for (i = 0; i < (n + 1); i++) buffs[i] = pbuff + i*esize;
0N/A for (i = 0; i < (n + 1); i++) buffs[(n + 1) + i] = buffs[i];
0N/A buffd = buffs[n] + esize;
0N/A buffe = buffd + 2*esize;
0N/A
0N/A wid -= (m - 1);
0N/A hgt -= (n - 1);
0N/A xsize = ssize - NCHAN*(m - 1);
0N/A emask = (0xFF00 >> (xsize & 7)) & 0xFF;
0N/A
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A for (l = 0; l < n; l++) {
0N/A mlib_d64 *buffn = buffs[l];
0N/A sp = sl + l*sll;
0N/A
0N/A#ifndef CONV_INDEX
0N/A if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
0N/A#else
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A LOAD_SRC();
0N/A }
0N/A#endif /* CONV_INDEX */
0N/A }
0N/A
0N/A /* init buffer */
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A for (j = 0; j < hgt; j++) {
0N/A mlib_d64 **buffc = buffs + buff_ind;
0N/A mlib_f32 *pk = karr, k0, k1, k2, k3;
0N/A sp = sl + n*sll;
0N/A
0N/A for (l = 0; l < n; l++) {
0N/A buff[l] = buffc[l];
0N/A }
0N/A buffn = buffc[n];
0N/A
0N/A#ifndef CONV_INDEX
0N/A for (l = 0; l < n; l++) {
0N/A if ((((mlib_addr)(sl + l*sll)) & 7) == 0) buff[l] = (mlib_d64*)(sl + l*sll);
0N/A }
0N/A if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
0N/A#endif
0N/A
0N/A#ifdef CONV_INDEX
0N/A ik_last = 0;
0N/A#else
0N/A ik_last = (m - 1);
0N/A#endif
0N/A
0N/A for (jk = 0; jk < n; jk += jk_size) {
0N/A jk_size = n - jk;
0N/A#ifdef CONV_INDEX
0N/A if (jk_size >= 5) jk_size = 3;
0N/A if (jk_size == 4) jk_size = 2;
0N/A#else
0N/A if (jk_size >= 6) jk_size = 4;
0N/A if (jk_size == 5) jk_size = 3;
0N/A#endif
0N/A coff = 0;
0N/A
0N/A if (jk_size == 2) {
0N/A
0N/A for (ik = 0; ik < m; ik++, coff += NCHAN) {
0N/A if (!jk && ik == ik_last) continue;
0N/A
0N/A k0 = pk[ik];
0N/A k1 = pk[ik + m];
0N/A
0N/A doff = coff/8;
0N/A buff0 = buff[jk ] + doff;
0N/A buff1 = buff[jk + 1] + doff;
0N/A
0N/A off = coff & 7;
0N/A vis_write_gsr(gsr_scale + off);
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d0 = vis_fpadd16(d10, d0);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A d1 = vis_fpadd16(d11, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A }
0N/A
0N/A pk += 2*m;
0N/A
0N/A } else if (jk_size == 3) {
0N/A
0N/A for (ik = 0; ik < m; ik++, coff += NCHAN) {
0N/A if (!jk && ik == ik_last) continue;
0N/A
0N/A k0 = pk[ik];
0N/A k1 = pk[ik + m];
0N/A k2 = pk[ik + 2*m];
0N/A
0N/A doff = coff/8;
0N/A buff0 = buff[jk ] + doff;
0N/A buff1 = buff[jk + 1] + doff;
0N/A buff2 = buff[jk + 2] + doff;
0N/A
0N/A off = coff & 7;
0N/A vis_write_gsr(gsr_scale + off);
0N/A
0N/A if (off == 0) {
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A
0N/A s0 = buff0[i];
0N/A s1 = buff1[i];
0N/A s2 = buff2[i];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d0 = vis_fpadd16(d20, d0);
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d1 = vis_fpadd16(d21, d1);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A } else if (off == 4) {
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
0N/A d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
0N/A
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d0 = vis_fpadd16(d20, d0);
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d1 = vis_fpadd16(d21, d1);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A } else {
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d0 = vis_fpadd16(d20, d0);
0N/A d0 = vis_fpadd16(d00, d0);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d1 = vis_fpadd16(d21, d1);
0N/A d1 = vis_fpadd16(d01, d1);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A }
0N/A }
0N/A
0N/A pk += 3*m;
0N/A
0N/A } else { /* jk_size == 4 */
0N/A
0N/A for (ik = 0; ik < m; ik++, coff += NCHAN) {
0N/A if (!jk && ik == ik_last) continue;
0N/A
0N/A k0 = pk[ik];
0N/A k1 = pk[ik + m];
0N/A k2 = pk[ik + 2*m];
0N/A k3 = pk[ik + 3*m];
0N/A
0N/A doff = coff/8;
0N/A buff0 = buff[jk ] + doff;
0N/A buff1 = buff[jk + 1] + doff;
0N/A buff2 = buff[jk + 2] + doff;
0N/A buff3 = buff[jk + 3] + doff;
0N/A
0N/A off = coff & 7;
0N/A vis_write_gsr(gsr_scale + off);
0N/A
0N/A if (off == 0) {
0N/A
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A
0N/A s0 = buff0[i];
0N/A s1 = buff1[i];
0N/A s2 = buff2[i];
0N/A s3 = buff3[i];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
0N/A d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
0N/A
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d20 = vis_fpadd16(d20, d30);
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d21 = vis_fpadd16(d21, d31);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A } else if (off == 4) {
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A s31 = buff3[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s30 = s31;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s31 = buff3[i + 1];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
0N/A d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
0N/A d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
0N/A d31 = vis_fmul8x16au(vis_read_hi(s31), k3);
0N/A
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d20 = vis_fpadd16(d20, d30);
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d21 = vis_fpadd16(d21, d31);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A
0N/A } else {
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A s31 = buff3[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < (xsize + 7)/8; i++) {
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s30 = s31;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s31 = buff3[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A s3 = vis_faligndata(s30, s31);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
0N/A d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
0N/A
0N/A d00 = vis_fpadd16(d00, d10);
0N/A d20 = vis_fpadd16(d20, d30);
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d01 = vis_fpadd16(d01, d11);
0N/A d21 = vis_fpadd16(d21, d31);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A buffd[2*i] = d0;
0N/A buffd[2*i + 1] = d1;
0N/A }
0N/A }
0N/A }
0N/A
0N/A pk += 4*m;
0N/A }
0N/A }
0N/A
0N/A /*****************************************
0N/A *****************************************
0N/A ** Final iteration **
0N/A *****************************************
0N/A *****************************************/
0N/A
0N/A jk_size = n;
0N/A#ifdef CONV_INDEX
0N/A if (jk_size >= 5) jk_size = 3;
0N/A if (jk_size == 4) jk_size = 2;
0N/A#else
0N/A if (jk_size >= 6) jk_size = 4;
0N/A if (jk_size == 5) jk_size = 3;
0N/A#endif
0N/A
0N/A k0 = karr[ik_last];
0N/A k1 = karr[ik_last + m];
0N/A k2 = karr[ik_last + 2*m];
0N/A k3 = karr[ik_last + 3*m];
0N/A
0N/A off = ik_last*NCHAN;
0N/A doff = off/8;
0N/A off &= 7;
0N/A buff0 = buff[0] + doff;
0N/A buff1 = buff[1] + doff;
0N/A buff2 = buff[2] + doff;
0N/A buff3 = buff[3] + doff;
0N/A vis_write_gsr(gsr_scale + off);
0N/A
0N/A#ifndef CONV_INDEX
0N/A if (jk_size == 2) {
0N/A dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < xsize/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A dp[i] = dd;
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if (emask) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A vis_pst_8(dd, dp + i, emask);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
0N/A
0N/A } else if (jk_size == 3) {
0N/A
0N/A dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < xsize/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A dp[i] = dd;
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if (emask) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A vis_pst_8(dd, dp + i, emask);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
0N/A
0N/A } else /* if (jk_size == 4) */ {
0N/A
0N/A dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
0N/A
0N/A s01 = buff0[0];
0N/A s11 = buff1[0];
0N/A s21 = buff2[0];
0N/A s31 = buff3[0];
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < xsize/8; i++) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s30 = s31;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s31 = buff3[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A s3 = vis_faligndata(s30, s31);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
0N/A d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d0 = vis_fpadd16(d0, d30);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A d1 = vis_fpadd16(d1, d31);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A dp[i] = dd;
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if (emask) {
0N/A s00 = s01;
0N/A s10 = s11;
0N/A s20 = s21;
0N/A s30 = s31;
0N/A s01 = buff0[i + 1];
0N/A s11 = buff1[i + 1];
0N/A s21 = buff2[i + 1];
0N/A s31 = buff3[i + 1];
0N/A s0 = vis_faligndata(s00, s01);
0N/A s1 = vis_faligndata(s10, s11);
0N/A s2 = vis_faligndata(s20, s21);
0N/A s3 = vis_faligndata(s30, s31);
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
0N/A d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
0N/A d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d0 = vis_fpadd16(d0, d30);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A d1 = vis_fpadd16(d1, d31);
0N/A
0N/A dd = vis_fpack16_pair(d0, d1);
0N/A vis_pst_8(dd, dp + i, emask);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A }
0N/A
0N/A if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
0N/A }
0N/A
0N/A#else /* CONV_INDEX */
0N/A
0N/A if (jk_size == 2) {
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A mlib_d64 d00, d01, d02, d03, d04, d05;
0N/A mlib_d64 d10, d11, d12, d13, d14, d15;
0N/A mlib_d64 d0, d1, d2, d3, d4, d5;
0N/A mlib_d64 s00 = buff0[i];
0N/A mlib_d64 s01 = buff0[i + 1];
0N/A mlib_d64 s02 = buff0[i + 2];
0N/A mlib_d64 s10 = buff1[i];
0N/A mlib_d64 s11 = buff1[i + 1];
0N/A mlib_d64 s12 = buff1[i + 2];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
0N/A d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
0N/A d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
0N/A d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
0N/A d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d2 = buffd[2*i + 2];
0N/A d3 = buffd[2*i + 3];
0N/A d4 = buffd[2*i + 4];
0N/A d5 = buffd[2*i + 5];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d2 = vis_fpadd16(d2, d02);
0N/A d2 = vis_fpadd16(d2, d12);
0N/A d3 = vis_fpadd16(d3, d03);
0N/A d3 = vis_fpadd16(d3, d13);
0N/A d4 = vis_fpadd16(d4, d04);
0N/A d4 = vis_fpadd16(d4, d14);
0N/A d5 = vis_fpadd16(d5, d05);
0N/A d5 = vis_fpadd16(d5, d15);
0N/A
0N/A buffe[i ] = vis_fpack16_pair(d0, d1);
0N/A buffe[i + 1] = vis_fpack16_pair(d2, d3);
0N/A buffe[i + 2] = vis_fpack16_pair(d4, d5);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A buffd[2*i + 2] = drnd;
0N/A buffd[2*i + 3] = drnd;
0N/A buffd[2*i + 4] = drnd;
0N/A buffd[2*i + 5] = drnd;
0N/A
0N/A LOAD_SRC();
0N/A }
0N/A
0N/A } else /* if (jk_size == 3) */ {
0N/A vis_write_gsr(gsr_scale + 7);
0N/A
0N/A#pragma pipeloop(0)
0N/A for (i = 0; i < dsize; i += 3) {
0N/A mlib_d64 d00, d01, d02, d03, d04, d05;
0N/A mlib_d64 d10, d11, d12, d13, d14, d15;
0N/A mlib_d64 d20, d21, d22, d23, d24, d25;
0N/A mlib_d64 d0, d1, d2, d3, d4, d5;
0N/A mlib_d64 s00 = buff0[i];
0N/A mlib_d64 s01 = buff0[i + 1];
0N/A mlib_d64 s02 = buff0[i + 2];
0N/A mlib_d64 s10 = buff1[i];
0N/A mlib_d64 s11 = buff1[i + 1];
0N/A mlib_d64 s12 = buff1[i + 2];
0N/A mlib_d64 s20 = buff2[i];
0N/A mlib_d64 s21 = buff2[i + 1];
0N/A mlib_d64 s22 = buff2[i + 2];
0N/A
0N/A d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
0N/A d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
0N/A d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
0N/A d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
0N/A d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
0N/A d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
0N/A d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
0N/A d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
0N/A d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
0N/A d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
0N/A d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
0N/A d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
0N/A d20 = vis_fmul8x16au(vis_read_hi(s20), k2);
0N/A d21 = vis_fmul8x16au(vis_read_lo(s20), k2);
0N/A d22 = vis_fmul8x16au(vis_read_hi(s21), k2);
0N/A d23 = vis_fmul8x16au(vis_read_lo(s21), k2);
0N/A d24 = vis_fmul8x16au(vis_read_hi(s22), k2);
0N/A d25 = vis_fmul8x16au(vis_read_lo(s22), k2);
0N/A
0N/A d0 = buffd[2*i];
0N/A d1 = buffd[2*i + 1];
0N/A d2 = buffd[2*i + 2];
0N/A d3 = buffd[2*i + 3];
0N/A d4 = buffd[2*i + 4];
0N/A d5 = buffd[2*i + 5];
0N/A d0 = vis_fpadd16(d0, d00);
0N/A d0 = vis_fpadd16(d0, d10);
0N/A d0 = vis_fpadd16(d0, d20);
0N/A d1 = vis_fpadd16(d1, d01);
0N/A d1 = vis_fpadd16(d1, d11);
0N/A d1 = vis_fpadd16(d1, d21);
0N/A d2 = vis_fpadd16(d2, d02);
0N/A d2 = vis_fpadd16(d2, d12);
0N/A d2 = vis_fpadd16(d2, d22);
0N/A d3 = vis_fpadd16(d3, d03);
0N/A d3 = vis_fpadd16(d3, d13);
0N/A d3 = vis_fpadd16(d3, d23);
0N/A d4 = vis_fpadd16(d4, d04);
0N/A d4 = vis_fpadd16(d4, d14);
0N/A d4 = vis_fpadd16(d4, d24);
0N/A d5 = vis_fpadd16(d5, d05);
0N/A d5 = vis_fpadd16(d5, d15);
0N/A d5 = vis_fpadd16(d5, d25);
0N/A
0N/A buffe[i ] = vis_fpack16_pair(d0, d1);
0N/A buffe[i + 1] = vis_fpack16_pair(d2, d3);
0N/A buffe[i + 2] = vis_fpack16_pair(d4, d5);
0N/A
0N/A buffd[2*i ] = drnd;
0N/A buffd[2*i + 1] = drnd;
0N/A buffd[2*i + 2] = drnd;
0N/A buffd[2*i + 3] = drnd;
0N/A buffd[2*i + 4] = drnd;
0N/A buffd[2*i + 5] = drnd;
0N/A
0N/A LOAD_SRC();
0N/A }
0N/A }
0N/A#endif /* CONV_INDEX */
0N/A
0N/A#ifdef CONV_INDEX
0N/A mlib_ImageColorTrue2IndexLine_U8_S16_3((void*)buffe, dl, wid, colormap);
0N/A#endif /* CONV_INDEX */
0N/A
0N/A sl += sll;
0N/A dl += dll;
0N/A
0N/A buff_ind++;
0N/A if (buff_ind >= (n + 1)) buff_ind = 0;
0N/A }
0N/A
0N/A mlib_free(pbuff);
0N/A if (buffs != buffs_local) mlib_free(buffs);
0N/A
0N/A return MLIB_SUCCESS;
0N/A}
0N/A
0N/A/***************************************************************/