From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id A46B33858C2F; Tue, 31 Oct 2023 12:08:24 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A46B33858C2F DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1698754104; bh=dSvIheAlm9gS7h28zUpldBvQ36YM4lz7AqHVyG1/jeE=; h=From:To:Subject:Date:In-Reply-To:References:From; b=C8o+tnl3CQp6MsPqd7l37wQUpC9WBus60Cw1R0Of6sBFBdf4BqWzI9HzEXxLUBmxQ /5cY+Lb33pnpvxePCibez6iNYq9tBvopM0TmwDcBwFJ1SKYuQIajHVvdoQ7sTMVQa4 mb+YLbMxQXkuRC8XFysz7+HuAWEehEJ8ljB08BvI= From: "zhangjungcc at gmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/110015] openjpeg is slower when built with gcc13 compared to clang16 Date: Tue, 31 Oct 2023 12:08:23 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: normal X-Bugzilla-Who: zhangjungcc at gmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: cc Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110015 jun zhang changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |zhangjungcc at gmail dot com --- Comment #2 from jun zhang --- The following loop is not vectorized by GCC, but it is vectorized by LLVM, which gives a 3% improvement. 
/* For more information, see https://godbolt.org/z/zMbjq41h5 */

/*
 * Reduced test case taken from OpenJPEG's tier-1 encoder: the sample loop in
 * opj_t1_encode_cblk() below is the loop this report is about.
 */

#include <string.h>   /* memcpy */

typedef signed int OPJ_INT32;
typedef unsigned int OPJ_UINT32;
typedef int OPJ_BOOL;
#define OPJ_TRUE 1
#define OPJ_FALSE 0
typedef char OPJ_CHAR;
typedef float OPJ_FLOAT32;
typedef double OPJ_FLOAT64;
typedef unsigned char OPJ_BYTE;

#define T1_NMSEDEC_FRACBITS 6
#define OPJ_RESTRICT restrict
#define OPJ_TLS_KEY_T1 0

#include <stddef.h>   /* size_t */

typedef size_t OPJ_SIZE_T;

/* Encoder-side code-block descriptor (members not needed here are commented out). */
typedef struct opj_tcd_cblk_enc {
    OPJ_BYTE* data;               /* Data */
    // opj_tcd_layer_t* layers;      /* layer information */
    // opj_tcd_pass_t* passes;       /* information about the passes */
    OPJ_INT32 x0, y0, x1,
              y1;     /* dimension of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */
    OPJ_UINT32 numbps;
    OPJ_UINT32 numlenbits;
    OPJ_UINT32 data_size;         /* Size of allocated data buffer */
    OPJ_UINT32 numpasses;         /* number of pass already done for the code-blocks */
    OPJ_UINT32 numpassesinlayers; /* number of passes in the layer */
    OPJ_UINT32 totalpasses;       /* total number of passes */
} opj_tcd_cblk_enc_t;

/* Tier-1 coder state (members not needed here are commented out). */
typedef struct opj_t1 {

    /** MQC component */
    // opj_mqc_t mqc;

    OPJ_INT32  *data;
    /** Flags used by decoder and encoder.
     * Such that flags[1+0] is for state of col=0,row=0..3,
       flags[1+1] for col=1, row=0..3, flags[1+flags_stride] for col=0,row=4..7, ...
       This array avoids too much cache thrashing when processing by 4 vertical samples
       as done in the various decoding steps. */
    // opj_flag_t *flags;

    OPJ_UINT32 w;
    OPJ_UINT32 h;
    OPJ_UINT32 datasize;
    OPJ_UINT32 flagssize;
    OPJ_BOOL   encoder;

    /* The 3 variables below are only used by the decoder */
    /* set to TRUE in multithreaded context */
    OPJ_BOOL     mustuse_cblkdatabuffer;
    /* Temporary buffer to concatenate all chunks of a codeblock */
    OPJ_BYTE    *cblkdatabuffer;
    /* Maximum size available in cblkdatabuffer */
    OPJ_UINT32   cblkdatabuffersize;
} opj_t1_t;

#define INLINE __inline__

/* Return the larger of two signed 32-bit integers. */
static INLINE OPJ_INT32 opj_int_max(OPJ_INT32 a, OPJ_INT32 b)
{
    return (a > b) ? a : b;
}

/*
 * Convert a two's-complement value to sign-magnitude representation:
 * non-negative values are unchanged, negative values become their magnitude
 * with bit 31 set.  The argument is fully parenthesised so that expressions
 * such as opj_to_smr(a - b) expand correctly.
 * NOTE: the argument is evaluated more than once, and negating the most
 * negative OPJ_INT32 value overflows, so callers must not pass it.
 */
#define opj_to_smr(x) \
    ((x) >= 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-(x)) | 0x80000000U))

/**
 * Scan the t1->w x t1->h samples in t1->data, rewrite every negative sample
 * in place in sign-magnitude form, and track the largest magnitude seen.
 *
 * @param t1    coder state; t1->data must hold at least t1->w * t1->h samples
 * @param cblk  code-block whose numbps field is set to 6 when any sample is
 *              non-zero and to 0 otherwise
 *
 * All other parameters are unused in this reduced test case.
 *
 * @return always 0.0; this reduced test case computes no result value, and an
 *         explicit return avoids falling off the end of a non-void function.
 */
OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
                               opj_tcd_cblk_enc_t* cblk,
                               OPJ_UINT32 orient,
                               OPJ_UINT32 compno,
                               OPJ_UINT32 level,
                               OPJ_UINT32 qmfbid,
                               OPJ_FLOAT64 stepsize,
                               OPJ_UINT32 cblksty,
                               OPJ_UINT32 numcomps,
                               const OPJ_FLOAT64 * mct_norms,
                               OPJ_UINT32 mct_numcomps)
{
    OPJ_INT32 max;
    OPJ_UINT32 i, j;
    OPJ_INT32* datap;

    max = 0;
    datap = t1->data;

    /* This is the loop under discussion: a max-reduction combined with a
     * conditional in-place store. */
    for (j = 0; j < t1->h; ++j) {
        const OPJ_UINT32 w = t1->w;
        for (i = 0; i < w; ++i, ++datap) {
            OPJ_INT32 tmp = *datap;
            if (tmp < 0) {
                OPJ_UINT32 tmp_unsigned;
                max = opj_int_max(max, -tmp);
                tmp_unsigned = opj_to_smr(tmp);
                /* memcpy stores the unsigned bit pattern into the signed
                 * slot without an aliasing-unsafe pointer cast. */
                memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32));
            } else {
                max = opj_int_max(max, tmp);
            }
        }
    }

    cblk->numbps = max ? 6 : 0;

    return 0.0;
}