From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <Tamar.Christina@arm.com>
Received: from EUR05-AM6-obe.outbound.protection.outlook.com
 (mail-am6eur05on2063.outbound.protection.outlook.com [40.107.22.63])
 by sourceware.org (Postfix) with ESMTPS id 0F6FA3858022
 for <gcc-patches@gcc.gnu.org>; Tue, 31 Aug 2021 13:30:25 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 0F6FA3858022
Received: from AM6P195CA0014.EURP195.PROD.OUTLOOK.COM (2603:10a6:209:81::27)
 by AS8PR08MB7173.eurprd08.prod.outlook.com (2603:10a6:20b:404::8) with
 Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4457.23; Tue, 31 Aug
 2021 13:30:19 +0000
Received: from VE1EUR03FT039.eop-EUR03.prod.protection.outlook.com
 (2603:10a6:209:81:cafe::c5) by AM6P195CA0014.outlook.office365.com
 (2603:10a6:209:81::27) with Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4478.17 via Frontend
 Transport; Tue, 31 Aug 2021 13:30:19 +0000
X-MS-Exchange-Authentication-Results: spf=pass (sender IP is 63.35.35.123)
 smtp.mailfrom=arm.com; gcc.gnu.org; dkim=pass (signature was verified)
 header.d=armh.onmicrosoft.com;gcc.gnu.org; dmarc=pass action=none
 header.from=arm.com;
Received-SPF: Pass (protection.outlook.com: domain of arm.com designates
 63.35.35.123 as permitted sender) receiver=protection.outlook.com;
 client-ip=63.35.35.123; helo=64aa7808-outbound-1.mta.getcheckrecipient.com;
Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by
 VE1EUR03FT039.mail.protection.outlook.com (10.152.19.196) with
 Microsoft SMTP
 Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id
 15.20.4457.17 via Frontend Transport; Tue, 31 Aug 2021 13:30:18 +0000
Received: ("Tessian outbound 56612e04f172:v103");
 Tue, 31 Aug 2021 13:30:18 +0000
X-CheckRecipientChecked: true
X-CR-MTA-CID: 4605509b256a9f9a
X-CR-MTA-TID: 64aa7808
Received: from 2995964353a3.1
 by 64aa7808-outbound-1.mta.getcheckrecipient.com id
 15DDAC8D-9C0B-41FF-A184-C4CE6EAAD251.1; 
 Tue, 31 Aug 2021 13:30:05 +0000
Received: from EUR04-VI1-obe.outbound.protection.outlook.com
 by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id 2995964353a3.1
 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384);
 Tue, 31 Aug 2021 13:30:05 +0000
ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none;
 b=dTi7tEeBLwX3FjTraeFqXkvaVBQr2LDG+avZijAV+xwm1JROL1vZtA4K2gUc/DCnpezrmmduBQew4O22FntK0HsArmSpYTtTfMuBGsvZCpwhFpH/buBZcCOEr2BIl8M7Ik4y+rBEvhET/OU3PmW8BEr2t5m/olgztGG/GarzT742gYDIBHix3MBRQEoPznTBf4MNNk+OKqS+MsIsHXZW8BjdYuz4riXGBPJRFzBJIbuOlemtAz8gsKwyVnRTlBaSLcL/9NROXW6QcpOH6ou+7/UY/kMskvK+LwqBlYnQy1LZ2N0Fh1wJH1UyBSEQM0ePupTG/0U5dvFZZtdSXun3gw==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; 
 s=arcselector9901;
 h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck;
 bh=6JBwySxAyE+tb/8jMmadLx8AL2iYcNF5uBS6NJ197hc=;
 b=QQcE70nE05F9VH5TBQe4hIds45+k3JHYySt4GNZRmMxWwK4SC3O1JPgqdUACwT33V06tZsWs6r3pEl3KHV7KatXr+YFOKKclItu8stNbUR4mgEv7o8FMgo178SNV4BIfl352FSmxoKjJgLYB0WMXE6Vp4E7K4ctjWquAN5ybUNxRxvpbQFcjKWqzUqOIiFcSkjpBOsf7MIPphXMhVSHf9A2EcKyyows2MXyq71Awwk0S3QOCKiXAk0F/KtDEVLtYYYMEKczTpuHiN78suXYjO6mu4/Aa/SxkTHS0iLZ/k8EhlRIZqFG52niby+fMwx+6eqjz25TZ+d/N/6Kr4rAorA==
ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass
 smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass
 header.d=arm.com; arc=none
Authentication-Results-Original: gcc.gnu.org; dkim=none (message not signed)
 header.d=none;gcc.gnu.org; dmarc=none action=none header.from=arm.com;
Received: from VI1PR08MB5325.eurprd08.prod.outlook.com (2603:10a6:803:13e::17)
 by VE1PR08MB5760.eurprd08.prod.outlook.com (2603:10a6:800:1af::10)
 with Microsoft SMTP Server (version=TLS1_2,
 cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4457.23; Tue, 31 Aug
 2021 13:30:02 +0000
Received: from VI1PR08MB5325.eurprd08.prod.outlook.com
 ([fe80::bd45:5ad5:f666:272a]) by VI1PR08MB5325.eurprd08.prod.outlook.com
 ([fe80::bd45:5ad5:f666:272a%4]) with mapi id 15.20.4457.024; Tue, 31 Aug 2021
 13:30:02 +0000
Date: Tue, 31 Aug 2021 14:30:00 +0100
From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, Richard.Earnshaw@arm.com, Marcus.Shawcroft@arm.com,
 Kyrylo.Tkachov@arm.com, richard.sandiford@arm.com
Subject: [PATCH 2/2]AArch64: Add better costing for vector constants and
 operations 
Message-ID: <patch-14774-tamar@arm.com>
Content-Type: multipart/mixed; boundary="J/dobhs11T7y2rNN"
Content-Disposition: inline
User-Agent: Mutt/1.9.4 (2018-02-28)
X-ClientProxiedBy: LO4P123CA0125.GBRP123.PROD.OUTLOOK.COM
 (2603:10a6:600:192::22) To VI1PR08MB5325.eurprd08.prod.outlook.com
 (2603:10a6:803:13e::17)
MIME-Version: 1.0
X-MS-Exchange-MessageSentRepresentingType: 1
Received: from arm.com (217.140.106.53) by
 LO4P123CA0125.GBRP123.PROD.OUTLOOK.COM (2603:10a6:600:192::22) with Microsoft
 SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id
 15.20.4478.17 via Frontend Transport; Tue, 31 Aug 2021 13:30:02 +0000
X-MS-PublicTrafficType: Email
X-MS-Office365-Filtering-Correlation-Id: 1f458e7e-6bcf-4aba-1037-08d96c8379c7
X-MS-TrafficTypeDiagnostic: VE1PR08MB5760:|AS8PR08MB7173:
X-MS-Exchange-Transport-Forked: True
X-Microsoft-Antispam-PRVS: <AS8PR08MB7173D3FF196DB35C0C2F5F8CFFCC9@AS8PR08MB7173.eurprd08.prod.outlook.com>
x-checkrecipientrouted: true
NoDisclaimer: true
X-MS-Oob-TLC-OOBClassifiers: OLM:8882;OLM:8882;
X-MS-Exchange-SenderADCheck: 1
X-MS-Exchange-AntiSpam-Relay: 0
X-Microsoft-Antispam-Untrusted: BCL:0;
X-Microsoft-Antispam-Message-Info-Original: Y6qPaclcM5rmT7xbdLLzWRPCBYt9q3ZWn+EtyWZSyWXbhm8Zb54/KZicTzKoGOo5ncjkXt2GJ9ixaCZXeovCwJFRk/tFUwB2EynBWpHU1ZW5OwEx9F+hDTzE+jZ9cd1vWaQx3BPVkGAB7ga8WYeu27+slvJ5+UwLyV8q7RBtwZ6GEfVqUfVKXye1HmRZ5cCwkMJM22dvZyuYz919KRCwEJnnzKDPf2/on+slhP/yQ4YumvRLelgsoCbk3EMMudc7KrFPS2GK/3faNHMoD+aH/l+Ymvah5v4EvVWhUJRIwxLKhynbZ/7/PrRGbtOB0xX0YYwk1d/3/u05FroPXuQEnXJxH5+mCP0CVP7pFvB4CPqrd2jPSpks7QHTbwue0l1UWeA8X9fRHWoMScNIoAuZVYHh4JQc1f70im9IPsvB8O4/3zGIPWXF7w4BLufIv8JCp8G6lCdJVvQXPoSDyhwz4XutUd2wW161/+RKeGsV7fr+PkTfIZjOW/AR6cARH+C7fsIEBc6FM7UlXIfEVF9LR5rA/2gehjLSh8R4xU6hUx0DeramikJbljM+raUmNSifRcGIxZg1AOZ6xR9X4CSSeg4rt783LA8TR43/otsIkRWwcNc/RG4iiHo6P8qEpA+nGiqCDDtEr1wdR/cWLDHa533HiY3AxPRyHbVPEQ1C3CbmsnQ55jRPODTCEiuR40ozj/vebg6LHDbIxg69+Vc9d3/LSolaWRquwCLmD2AacG06rToAOAyBXomXcBbSUmGCb2ZmCzo5GV2QQxGzGuDs9IgvhDzTvIqaHMhMLQ+Xjmvw2Tm5TI35oEm/MO0TuHdya2p5QI54+IuUOV/rjcN9bg==
X-Forefront-Antispam-Report-Untrusted: CIP:255.255.255.255; CTRY:; LANG:en;
 SCL:1; SRV:; IPV:NLI; SFV:NSPM; H:VI1PR08MB5325.eurprd08.prod.outlook.com;
 PTR:; CAT:NONE;
 SFS:(4636009)(136003)(376002)(39850400004)(366004)(396003)(346002)(4743002)(44832011)(8676002)(956004)(2906002)(30864003)(7696005)(4326008)(2616005)(52116002)(44144004)(66556008)(5660300002)(83380400001)(66476007)(235185007)(26005)(316002)(55016002)(6916009)(33964004)(186003)(8886007)(38100700002)(66946007)(8936002)(38350700002)(478600001)(66616009)(36756003)(86362001)(4216001)(2700100001);
 DIR:OUT; SFP:1101; 
X-MS-Exchange-AntiSpam-MessageData-ChunkCount: 1
X-MS-Exchange-AntiSpam-MessageData-0: =?utf-8?B?U1JieGd1YXZldTJvZUlSK2J6dE5tSDlJell1WUxYQ3gyZFNCWk9JUGRhOFRJ?=
 =?utf-8?B?VzQyTE5uSDNLdjR5aGJXZHVSKzhPT0puWm4wWHp3eXdLWnhiMXNrY1NOMExo?=
 =?utf-8?B?VDZaQmlBNkdYVGU0SDVhN3plYUdWZkZQZGdBQzgvVm1GUFlaT1poNzdFdGFn?=
 =?utf-8?B?aVEzOXhsTjgySWFiMGJSb1hMZGpwV3lBRGpaTllKUFgzTGt4SXQvb0xQdDlX?=
 =?utf-8?B?UmFnUDNwWnhWaExFSVRqQmZ4MTRKTFZpbE9meWRPK2JKb1RQaitBci9SeEQ4?=
 =?utf-8?B?RG9GZG80R1RlUEF1VUdIc2lwSS9YcWltVEpGUW4wSmJMZHltMStTazd0bEtm?=
 =?utf-8?B?dFdBTXpQN3drdHlkQVJaKy9PckpocEFDUFF2UGdvNWNxcDA1OXJwTjI3SElk?=
 =?utf-8?B?cXI3NWpWUnV5bW4vWTRFOFhWVk9ZemVPQU5XY2VGYUc4QXBKSHQveERUYkw1?=
 =?utf-8?B?WU1MYnhuRzBDVjFOVkhlWjNVVVBuUDc3cHdJYmorMTIwcXE3RGlrcW1YOWox?=
 =?utf-8?B?dnYrYkJMZXBGWUs3RmpzTlVFMDFOeHhHZmlSNkFvUzhSOU02VjI0RjVVUXNX?=
 =?utf-8?B?dXNOV3hkL2YraGcyVDFCNi8rSVNzTGFGRFgydVpMRmRUOFlNRFk5NEFGSGFr?=
 =?utf-8?B?OTJ5UmFmUS9qTHQ5QmNLYzIzeURKZk5teDdPNUlkcUpneitLS0hBczFRKzZO?=
 =?utf-8?B?cHBrMzEzbFl1YXByd2ZqcWdqajE5WjFXWjZXZXFObDVkTEFnV1BwM0x3K2oz?=
 =?utf-8?B?TG44UnQwWGtvNUpNTmhqd2J5TkEzWEdqTjU2Z1JtbXJuQVI0K2ZqeWl3a1VI?=
 =?utf-8?B?RDhhNU5qaEtWTFlRYmIyNnlKMHhYYnlBUnNxTUhVZW1BTXB4V0J0TUdyMExH?=
 =?utf-8?B?ZXFpUXJuaGxCVzJha2p0Z2NBbExQQXNhTlNCK0g4LzhqclVrQjFZa2ZKaDBM?=
 =?utf-8?B?SHRxYytOcE15SWtQYjhTK1g0dklQck9XajBiTzQraUQwemt1L1AvcUVMTldr?=
 =?utf-8?B?NWpTWloxaExRNlJpU3VqZTVWS2hRdEFWekQ1T2pTVkxtT0cyY0x4S3BZSkN5?=
 =?utf-8?B?WUFoVmFibXdtcnJKeEdsVVBUVDhyVTNWWUlyaEU0UVNmc2dhSEVyQXMzQ3Fn?=
 =?utf-8?B?QmlxY3dCNHVrNjN1blFuNmdiVTBQenRWblA4T1BTcjJrSHlVc0krZ1VEbjIr?=
 =?utf-8?B?K2VGemRIbk43dEFEWFBnNGFRRGFxQkx2Vk9qTWtxemJVMlEzMUlvSkw4eml5?=
 =?utf-8?B?Wm1FN0xhUGh1MUJabjQ0YUUzVVVoalJ5M0w0UTdXa1ZlZ0hRU3ZrTHEvc1Yv?=
 =?utf-8?B?aUpReEw5Z0JMdFNrVEh0dUs5dGhIWXJ3YkpHamQwT3BjeXFBRmlwT1dTQURw?=
 =?utf-8?B?SGdlZVNsd3ZGYXVXZXI1em5PRSt5Tmp1Y0hvR0RkV3VzNEtYa1FsZDdHcUt0?=
 =?utf-8?B?WFRpY1RDVDg0QmkzejZhc3NGeXlxOXg4aGp1WitLakptZ2xoa1dXYUJ3Z0JX?=
 =?utf-8?B?K2YxeVVGZ3ZHMjZEcmFYQWtrSzFSblh0Z1R1Nlgvc0NLLzllajZ0N1dhaGNF?=
 =?utf-8?B?S3owTlFjSWNCWVVuSVpoc05aajNZTkhnYVZFK1h6cUhMRnZzWjQrRG1iTFZI?=
 =?utf-8?B?ZmZFWmI4eEVSSTdaZ2d6N3hBNHM1b3N3RUlHRkQ0YTZkU2xpMERKR1NKRU1C?=
 =?utf-8?B?dVY3bW9mZDNhZjkvUGo1T1ladkVUNEgzcXRFVDZ3ZnZwaHFOcHNnREdWUFBJ?=
 =?utf-8?Q?jB6zpG+sEI945dDQFKqGLF1LKb7CTAtSoNZIICy?=
X-MS-Exchange-Transport-CrossTenantHeadersStamped: VE1PR08MB5760
Original-Authentication-Results: gcc.gnu.org; dkim=none (message not signed)
 header.d=none;gcc.gnu.org; dmarc=none action=none header.from=arm.com;
X-EOPAttributedMessage: 0
X-MS-Exchange-Transport-CrossTenantHeadersStripped: VE1EUR03FT039.eop-EUR03.prod.protection.outlook.com
X-MS-Office365-Filtering-Correlation-Id-Prvs: af0d48e4-4cb5-43b9-af73-08d96c836ffe
X-Microsoft-Antispam: BCL:0;
X-Microsoft-Antispam-Message-Info: U3ul/erA08Rl48Zp2LJKH1LpADOgIR8PqjXBtPnGDV2K8LLPEjPwprgOCe/HNPtaKOo/eVSR2nUlD9B3WQuewrBAXgSUTc/vQfWL7lnd2Hz+OPMDYd6WFCBzyCM3hL5zlB2f/iLZ5oRuNiOgujJmDF5PF6FmCzpiz/W6BfEwdOxOj5bQYPnX3jXAKkzfKh2J58Q5+yX89QJs72qmsrOj1Ind2iDVIRi27N2ztPijrIlbWOl+yT0iG/TgMbmv5WvykCvevU+YI58PVTdWX/OwKgCQOlVyEUHaVhlIczDNXqvUi5j7wSYowToiCGgbXokWmARp7PV8kLN8qIYh97dgc82dfzKm+ZB0/AWzMaIX8yuLQmYDedXIFGhlUDOvMc/q5ZQDi+LdWfwSkSKVr4XaSgJuMbM/3ftYiAENYnAFsrTYlqo2SHOj9yZqAw1FAfFoQzDtttOoG0yANeFV20/DUHWgWO8d1y67rcHiBc7C5za36hycKu3lHx2IUcvmWuJUCqTMJ1JyLIOTZGxfBpEl1ePEuEWtVf/jd8+eZCT0VhRxY/Z5SLrc68+e+y1iPUS/qHc6CQXdN+dmeF+YOsW5ChdTH/nEvwchjZVO5s9RmoYrFOquqnZCzwQvlFAjL3DAUcKuyHQfduPYMSUD4IWRgUFMKH3pBCGA6jPUvsEdIXw9LXLFlyU+g6IGaEHN2xnw7C3+Wmc6q4IGg42fnCyijI35716tmtI4vAGPH00WIZfJ2bb/QjBWx+R6ohZPkuj+bQkjn6VltcayYxQqAdPbjWF6nF+GvNGp3BR9L8g9yti8bgKAgwF1zBFglAVa7JMHdYJRCeidEWGfxgt2unUiMA==
X-Forefront-Antispam-Report: CIP:63.35.35.123; CTRY:IE; LANG:en; SCL:1; SRV:;
 IPV:CAL; SFV:NSPM; H:64aa7808-outbound-1.mta.getcheckrecipient.com;
 PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com; CAT:NONE;
 SFS:(4636009)(396003)(376002)(346002)(39860400002)(136003)(46966006)(36840700001)(86362001)(44144004)(6916009)(33964004)(316002)(55016002)(2616005)(26005)(36860700001)(82310400003)(8886007)(186003)(4326008)(36756003)(7696005)(82740400003)(83380400001)(30864003)(478600001)(956004)(44832011)(81166007)(356005)(5660300002)(4743002)(2906002)(235185007)(8936002)(336012)(66616009)(47076005)(8676002)(70206006)(70586007)(4216001)(2700100001);
 DIR:OUT; SFP:1101; 
X-OriginatorOrg: arm.com
X-MS-Exchange-CrossTenant-OriginalArrivalTime: 31 Aug 2021 13:30:18.7817 (UTC)
X-MS-Exchange-CrossTenant-Network-Message-Id: 1f458e7e-6bcf-4aba-1037-08d96c8379c7
X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d
X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d; Ip=[63.35.35.123];
 Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com]
X-MS-Exchange-CrossTenant-AuthSource: VE1EUR03FT039.eop-EUR03.prod.protection.outlook.com
X-MS-Exchange-CrossTenant-AuthAs: Anonymous
X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem
X-MS-Exchange-Transport-CrossTenantHeadersStamped: AS8PR08MB7173
X-Spam-Status: No, score=-13.6 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, GIT_PATCH_0, KAM_LOTSOFHASH, KAM_SHORT, MSGID_FROM_MTA_HEADER,
 RCVD_IN_DNSWL_NONE, RCVD_IN_MSPIKE_H2, SPF_HELO_PASS, SPF_PASS, TXREP,
 UNPARSEABLE_RELAY autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: gcc-patches@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-patches mailing list <gcc-patches.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Tue, 31 Aug 2021 13:30:36 -0000

--J/dobhs11T7y2rNN
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline

Hi All,

This patch adds extended costing to cost the creation of constants and the
manipulation of constants.  The default values provided are based on
architectural expectations and each cost model can be individually tweaked as
needed.

The changes in this patch cover:

* Construction of PARALLEL or CONST_VECTOR:
  Adds better costing for vectors of constants, based on the constant
  being created and the instruction that can be used to create it, i.e. a movi
  is cheaper than a literal load etc.
* Construction of a vector through a vec_dup.
* Extraction of part of a vector using a vec_select.  In this part we had to
  make some opportunistic assumptions.  In particular we had to model the
  extraction of the high half of a register as being "free" in order to make
  fusion with the NEON high-part instructions possible.  In the event that there
  is no <insn>2 variant for the instruction the select would still be cheaper
  than the load.

Unfortunately on AArch64 you need -O3 when using intrinsics for this to kick
in until we fix vld1/2/3 to be gimple instead of RTL intrinsics.

This should also fix the stack allocations.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
	movi, dup and extract costing fields.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
	them.
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
	* config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose.
	(aarch64_rtx_costs): Add extra costs.
	(aarch64_simd_dup_constant): Support check only mode.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5638d096fa84a27b4ea397f62cd0d05a28e7c8c..6814dae079c9ff40aaa2bb625432bf9eb8906b73 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f80de2ca8971086d6a4bf3aa7793d0cda953b5c8..26d78ffe98a3445dcc490c93849c46a8c2595cf8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -302,6 +302,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12665,7 +12666,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -13936,8 +13937,65 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case PARALLEL:
+      /* Fall through */
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	  {
+	  case CONST_VECTOR:
+	    /* Load using MOVI/MVNI.  */
+	    if (aarch64_simd_valid_immediate (x, NULL))
+	      *cost += extra_cost->vect.movi;
+	    else /* Load using constant pool.  */
+	      *cost += extra_cost->ldst.load;
+	    break;
+	  /* Load using a DUP.  */
+	  case VEC_DUPLICATE:
+	    *cost += extra_cost->vect.dup;
+	    break;
+	  default:
+	    *cost += extra_cost->ldst.load;
+	    break;
+	  }
+	  return true;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	*cost += extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  int nelts;
+	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
+	      || (BYTES_BIG_ENDIAN
+		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
+		  && INTVAL (op1) == nelts - 1))
+	    ;
+	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	  /* Selecting the high part is not technically free, but we lack
+	     enough information to decide that here.  For instance selecting
+	     the high-part of a vec_dup *is* free or to feed into any _high
+	     instruction.   Both of which we can't really tell.  That said
+	     have a better chance to optimize an dup vs multiple constants.  */
+	    ;
+	  else
+	    *cost += extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
@@ -20663,9 +20721,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20677,7 +20738,8 @@ aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20685,9 +20747,12 @@ aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20719,7 +20784,7 @@ aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..36e468aacfadd7701c6a7cd432bee81472111a16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test0:
+**	movi	v2.16b, 0x3
+**	ldr	q0, \[x0\]
+**	uxtl	v1.8h, v0.8b
+**	uxtl2	v0.8h, v0.16b
+**	ldr	q3, \[x1\]
+**	umlal	v1.8h, v3.8b, v2.8b
+**	umlal2	v0.8h, v3.16b, v2.16b
+**	addhn	v0.8b, v1.8h, v0.8h
+**	str	d0, \[x2\]
+**	ret
+*/
+
+void test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
+{
+  uint8x16_t three_u8 = vdupq_n_u8(3);
+  uint8x16_t x = vld1q_u8(inptr0);
+  uint8x16_t y = vld1q_u8(inptr1);
+  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
+  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
+  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), vget_low_u8(three_u8));
+  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), vget_high_u8(three_u8));
+  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
+}
+
+/*
+**test1:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC0
+**	ldr	q1, \[x2, #:lo12:.LC0\]
+**	add	v0.2d, v1.2d, v0.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x2, .LC1
+**	ldr	q1, \[x2, #:lo12:.LC1\]
+**	add	v0.2d, v0.2d, v1.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC2
+**	ldr	q1, \[x2, #:lo12:.LC2\]
+**	add	v0.4s, v1.4s, v0.4s
+**	str	q0, \[x1\]
+**	fmov	w1, s1
+**	orr	w0, w0, w1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v0.16b, v0.16b, 7
+**	mov	x0, 16512
+**	movk	x0, 0x1020, lsl 16
+**	movk	x0, 0x408, lsl 32
+**	movk	x0, 0x102, lsl 48
+**	fmov	d1, x0
+**	pmull	v2.1q, v0.1d, v1.1d
+**	dup	v1.2d, v1.d\[0\]
+**	pmull2	v0.1q, v0.2d, v1.2d
+**	trn2	v2.8b, v2.8b, v0.8b
+**	umov	w0, v2.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+


-- 

--J/dobhs11T7y2rNN
Content-Type: text/x-diff; charset=utf-8
Content-Disposition: attachment; filename="rb14774.patch"

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5638d096fa84a27b4ea397f62cd0d05a28e7c8c..6814dae079c9ff40aaa2bb625432bf9eb8906b73 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f80de2ca8971086d6a4bf3aa7793d0cda953b5c8..26d78ffe98a3445dcc490c93849c46a8c2595cf8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -302,6 +302,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12665,7 +12666,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -13936,8 +13937,65 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case PARALLEL:
+      /* Fall through: cost a PARALLEL of constants like a CONST_VECTOR.  */
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	  {
+	  case CONST_VECTOR:
+	    /* MOVI/MVNI if the built constant (X may be a PARALLEL) is valid.  */
+	    if (aarch64_simd_valid_immediate (gen_insn, NULL))
+	      *cost += extra_cost->vect.movi;
+	    else /* Load using constant pool.  */
+	      *cost += extra_cost->ldst.load;
+	    break;
+	  /* Load using a DUP.  */
+	  case VEC_DUPLICATE:
+	    *cost += extra_cost->vect.dup;
+	    break;
+	  default: /* Any other form is costed as a load.  */
+	    *cost += extra_cost->ldst.load;
+	    break;
+	  }
+	  return true;
+	}
+    case VEC_CONCAT:
+	/* Depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	*cost += extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  /* Cost a lowpart or highpart select as free and anything else as
+	     an extract.  The selector is handed to the vec_series_* helpers
+	     below as a vector of lane indices (a PARALLEL per the RTL
+	     documentation), so it is never const0_rtx and must not be read
+	     with INTVAL.
+	     NOTE(review): confirm vec_series_lowpart_p also accepts the
+	     single-lane lowpart select on both endiannesses.  */
+	  rtx op1 = XEXP (x, 1);
+	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	  /* Selecting the high part is not technically free, but we lack
+	     enough information to decide that here.  For instance selecting
+	     the high-part of a vec_dup *is* free or to feed into any _high
+	     instruction.  Both of which we can't really tell.  That said, we
+	     have a better chance to optimize a dup vs multiple constants.  */
+	    ;
+	  else
+	    *cost += extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
@@ -20663,9 +20721,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20677,7 +20738,8 @@ aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20685,9 +20747,12 @@ aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20719,7 +20784,7 @@ aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;	/* Constant vector loaded with MOVI/MVNI.  */
+  const int dup;	/* VEC_DUPLICATE, or a constant loaded with DUP.  */
+  const int extract;	/* VEC_SELECT that is not costed as free.  */
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..36e468aacfadd7701c6a7cd432bee81472111a16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test0:
+**	movi	v2.16b, 0x3
+**	ldr	q0, \[x0\]
+**	uxtl	v1.8h, v0.8b
+**	uxtl2	v0.8h, v0.16b
+**	ldr	q3, \[x1\]
+**	umlal	v1.8h, v3.8b, v2.8b
+**	umlal2	v0.8h, v3.16b, v2.16b
+**	addhn	v0.8b, v1.8h, v0.8h
+**	str	d0, \[x2\]
+**	ret
+*/
+
+void test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
+{
+  uint8x16_t three_u8 = vdupq_n_u8(3);  /* Splat of 3; both halves are used below.  */
+  uint8x16_t x = vld1q_u8(inptr0);
+  uint8x16_t y = vld1q_u8(inptr1);
+  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
+  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
+  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), vget_low_u8(three_u8));  /* Low half of the splat.  */
+  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), vget_high_u8(three_u8));  /* High half of the splat.  */
+  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
+}
+
+/*
+**test1:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC0
+**	ldr	q1, \[x2, #:lo12:.LC0\]
+**	add	v0.2d, v1.2d, v0.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};  /* Same value in both lanes.  */
+  uint64_t res = a | arr[0];  /* Scalar use of the constant.  */
+  uint64x2_t val = vld1q_u64 (arr);  /* Vector use of the same constant.  */
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x2, .LC1
+**	ldr	q1, \[x2, #:lo12:.LC1\]
+**	add	v0.2d, v0.2d, v1.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);  /* Constant splat.  */
+  uint64_t arr = vgetq_lane_u64 (val, 0);  /* Lane 0 of the splat, used as a scalar.  */
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);  /* The splat is also used as a whole vector.  */
+  return res;
+}
+
+/*
+**test3:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC2
+**	ldr	q1, \[x2, #:lo12:.LC2\]
+**	add	v0.4s, v1.4s, v0.4s
+**	str	q0, \[x1\]
+**	fmov	w1, s1
+**	orr	w0, w0, w1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };  /* Same value in all four lanes.  */
+  uint32_t res = a | arr[0];  /* Scalar use of the constant.  */
+  uint32x4_t val = vld1q_u32 (arr);  /* Vector use of the same constant.  */
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v0.16b, v0.16b, 7
+**	mov	x0, 16512
+**	movk	x0, 0x1020, lsl 16
+**	movk	x0, 0x408, lsl 32
+**	movk	x0, 0x102, lsl 48
+**	fmov	d1, x0
+**	pmull	v2.1q, v0.1d, v1.1d
+**	dup	v1.2d, v1.d\[0\]
+**	pmull2	v0.1q, v0.2d, v1.2d
+**	trn2	v2.8b, v2.8b, v0.8b
+**	umov	w0, v2.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);  /* Keep only the top bit of each byte.  */
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);  /* Constant splat used twice below.  */
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));  /* Lane 0 of the splat.  */
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);  /* Whole splat as a vector.  */
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+


--J/dobhs11T7y2rNN--