From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id A46B33858C2F; Tue, 31 Oct 2023 12:08:24 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A46B33858C2F
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1698754104;
	bh=dSvIheAlm9gS7h28zUpldBvQ36YM4lz7AqHVyG1/jeE=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=C8o+tnl3CQp6MsPqd7l37wQUpC9WBus60Cw1R0Of6sBFBdf4BqWzI9HzEXxLUBmxQ
	 /5cY+Lb33pnpvxePCibez6iNYq9tBvopM0TmwDcBwFJ1SKYuQIajHVvdoQ7sTMVQa4
	 mb+YLbMxQXkuRC8XFysz7+HuAWEehEJ8ljB08BvI=
From: "zhangjungcc at gmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/110015] openjpeg is slower when built with gcc13
 compared to clang16
Date: Tue, 31 Oct 2023 12:08:23 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: middle-end
X-Bugzilla-Version: 14.0
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Who: zhangjungcc at gmail dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: cc
Message-ID: <bug-110015-4-Xtv0QSvBo2@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-110015-4@http.gcc.gnu.org/bugzilla/>
References: <bug-110015-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D110015

jun zhang <zhangjungcc at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |zhangjungcc at gmail dot c=
om
--- Comment #2 from jun zhang <zhangjungcc at gmail dot com> ---
  The following loop is not vectorized by GCC, but LLVM does vectorize it,
which gives a 3% improvement.
For more information, please refer to: https://godbolt.org/z/zMbjq41h5

#include<string.h>
/* Integer aliases.  NOTE(review): the 32 in the names is not enforced;
   plain int and unsigned int are used, so the width is platform-defined. */
typedef signed int  OPJ_INT32;
typedef unsigned int OPJ_UINT32;
/* Boolean stored as an int, followed by its two values. */
typedef int OPJ_BOOL;
#define OPJ_TRUE 1
#define OPJ_FALSE 0
typedef char          OPJ_CHAR;
typedef float         OPJ_FLOAT32;
typedef double        OPJ_FLOAT64;
typedef unsigned char OPJ_BYTE;
/* The three macros below are not referenced by the code in this
   test case. */
#define T1_NMSEDEC_FRACBITS 6
#define OPJ_RESTRICT restrict
#define OPJ_TLS_KEY_T1  0
#include <stdio.h>
/* size_t comes from the standard headers included in this file. */
typedef size_t   OPJ_SIZE_T;

/* Code-block descriptor (the _enc suffix suggests the encoder side).
   In the code shown in this test case only numbps is written, by
   opj_t1_encode_cblk; the other fields are not accessed.  The two
   members that are commented out are left disabled in this test case. */
typedef struct opj_tcd_cblk_enc {
    OPJ_BYTE* data;               /* Data */
//    opj_tcd_layer_t* layers;      /* layer information */
//    opj_tcd_pass_t* passes;       /* information about the passes */
    OPJ_INT32 x0, y0, x1,
              y1;     /* dimension of the code-block: upper left corner
                         (x0,y0), lower right corner (x1,y1) */
    OPJ_UINT32 numbps;
    OPJ_UINT32 numlenbits;
    OPJ_UINT32 data_size;         /* Size of allocated data buffer */
    OPJ_UINT32
    numpasses;         /* number of passes already done for the code-block */
    OPJ_UINT32 numpassesinlayers; /* number of passes in the layer */
    OPJ_UINT32 totalpasses;       /* total number of passes */
} opj_tcd_cblk_enc_t;
/* State object passed to opj_t1_encode_cblk.  The code shown in this
   test case reads only data, w and h; the members that are commented
   out are left disabled here. */
typedef struct opj_t1 {

    /** MQC component */
//    opj_mqc_t mqc;

    /* Sample buffer; opj_t1_encode_cblk walks it as h rows of w entries. */
    OPJ_INT32  *data;
    /** Flags used by decoder and encoder.
     * Such that flags[1+0] is for state of col 0, rows 0..3,
       flags[1+1] for col 1, rows 0..3, flags[1+flags_stride] for
       col 0, rows 4..7, ...
       This array avoids too much cache thrashing when processing by
       4 vertical samples as done in the various decoding steps. */
//    opj_flag_t *flags;

    /* Entries per row, used as the inner loop bound by the encoder loop. */
    OPJ_UINT32 w;
    /* Number of rows, used as the outer loop bound by the encoder loop. */
    OPJ_UINT32 h;
    OPJ_UINT32 datasize;
    OPJ_UINT32 flagssize;
    OPJ_BOOL   encoder;

    /* The 3 variables below are only used by the decoder */
    /* set to TRUE in multithreaded context */
    OPJ_BOOL     mustuse_cblkdatabuffer;
    /* Temporary buffer to concatenate all chunks of a codeblock */
    OPJ_BYTE    *cblkdatabuffer;
    /* Maximum size available in cblkdatabuffer */
    OPJ_UINT32   cblkdatabuffersize;
} opj_t1_t;

#define INLINE __inline__
/* Returns the larger of the two signed arguments; when they compare
   equal, b is returned. */
static INLINE OPJ_INT32 opj_int_max(OPJ_INT32 a, OPJ_INT32 b)
{
    if (a > b) {
        return a;
    }
    return b;
}
/* Maps a signed value to sign-magnitude form: a non-negative x is kept as
   is, a negative x becomes its magnitude with bit 31 set.
   NOTE(review): the definition is wrapped onto a second line without a
   trailing backslash (probably a mail line-wrap artifact); it has to be
   one logical line to compile.  Also, -x is not parenthesized as -(x),
   and x is evaluated more than once. */
#define opj_to_smr(x)   ((x) >=3D 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-x) |
0x80000000U))
/*
 * Test-case version of the encoder entry point; per the report text this
 * is the loop that GCC does not vectorize.
 *
 * Scans the t1->data buffer as t1->h rows of t1->w entries, visited
 * contiguously, records the largest magnitude seen and rewrites every
 * negative entry in place with opj_to_smr.  Afterwards cblk->numbps is
 * set to 6 when a non-zero magnitude was found, otherwise to 0.
 *
 * Only t1 and cblk are used; the remaining parameters are not referenced
 * in this body.
 *
 * NOTE(review): the function is declared to return OPJ_FLOAT64 but has no
 * return statement, so a caller must not use the result.
 */
OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
                                      opj_tcd_cblk_enc_t* cblk,
                                      OPJ_UINT32 orient,
                                      OPJ_UINT32 compno,
                                      OPJ_UINT32 level,
                                      OPJ_UINT32 qmfbid,
                                      OPJ_FLOAT64 stepsize,
                                      OPJ_UINT32 cblksty,
                                      OPJ_UINT32 numcomps,
                                      const OPJ_FLOAT64 * mct_norms,
                                      OPJ_UINT32 mct_numcomps)
{
    OPJ_INT32 max;
    OPJ_UINT32 i, j;
    OPJ_INT32* datap;

    /* max accumulates the largest magnitude; datap is the running cursor
       and is never reset between rows. */
    max =3D 0;
    datap =3D t1->data;
    for (j =3D 0; j < t1->h; ++j) {
        /* Row width is read from t1 once per row. */
        const OPJ_UINT32 w =3D t1->w;
        for (i =3D 0; i < w; ++i, ++datap) {
            OPJ_INT32 tmp =3D *datap;
            if (tmp < 0) {
                OPJ_UINT32 tmp_unsigned;
                /* NOTE(review): -tmp overflows when tmp is the most
                   negative int value. */
                max =3D opj_int_max(max, -tmp);
                tmp_unsigned =3D opj_to_smr(tmp);
                /* Store the unsigned bit pattern back into the int slot
                   without a pointer cast. */
                memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32));
            } else {
                max =3D opj_int_max(max, tmp);
            }
        }
    }
        /* 6 when any non-zero sample was seen, otherwise 0. */
        cblk->numbps =3D max ? 6 : 0;
}=