From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-lf1-x12e.google.com (mail-lf1-x12e.google.com [IPv6:2a00:1450:4864:20::12e]) by sourceware.org (Postfix) with ESMTPS id 9667B385DDF7 for ; Thu, 1 Aug 2024 14:58:29 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 9667B385DDF7 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=embecosm.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=embecosm.com ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 9667B385DDF7 Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=2a00:1450:4864:20::12e ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722524319; cv=none; b=vm3jr0QvHb/5Qp7MWcIPxS17wDi9aRKpmB5PWpQJqarVmNTzFw14YR2MbPMz/CFEKU/A+8wpDQ2E0uafXw4zi1xhiC9WcfRyLjXFlELG3bAHCiMJ07cvwcaSqUG9x6EQUH4aieaMpyHwtd5z/jD6UqNLUxUA0epHtHNB5TuK0Hs= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722524319; c=relaxed/simple; bh=fKex+axjuzCP+WRKpL7TSoyYtG+RSL2vUCUeCTs4uTI=; h=DKIM-Signature:From:To:Subject:Date:Message-ID:MIME-Version; b=Gwc7VWqIy/D03lRa3qfrxsSO/WcTf4omiGLlFxXGuE4Z82tOMmqaQTIYSShtx6gv7Ez8QEct0N6cbNpnKbUdWaj7vp3nT+DByIG9/Y3xBxjyqgIrMYU0AZTZJ+rEfyzIxI9Qh5j38oie0iU0GQi3W5+mPkuQxt9H6PPC1YWPJ5I= ARC-Authentication-Results: i=1; server2.sourceware.org Received: by mail-lf1-x12e.google.com with SMTP id 2adb3069b0e04-52efba36802so11658155e87.2 for ; Thu, 01 Aug 2024 07:58:29 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=embecosm.com; s=google; t=1722524308; x=1723129108; darn=gcc.gnu.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=ruHsPD22REOyC49EcNuguA0bv5EDoML7F3wFz4W3g60=; b=CJRrN5vfgJEiUsnou+ae99OrRov0Ukrf/qhKPkSN/tqgyeXE/fKenKR1e4VRNNMPUg JMZZL0UuKUsYF2khsRObdoGThwVivPGmM9BuGMAbZHnffLF5fTj44AhfUrQLVQqLtE1x jCIUGHvXAlTzkXUpxVpgIWCTVf3pAS7IBl+R1pZ/muesdBGE74QqPJlxPVFhkAHfW/Jk 
APhq0VryyZhQ8Tib9bH5Bt1yYsMxOu3TXpF819kMxMfSzDqEOnOQBPjvbGzdwE/vJsg5 fNYbjAOzWoVDeLAraVATX7ZP39hhrcj9HbzBPvaRIicIt0eKznACU0nwrqhzjJcIAOxI Rzpw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1722524308; x=1723129108; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=ruHsPD22REOyC49EcNuguA0bv5EDoML7F3wFz4W3g60=; b=kTYZ2pBCId1Z18aaVXpnJqzaFecQQKj/2RHYJsvU4oqLp5uQvpT/wwlFID8PNKsZY1 et5r7VhUEjM4Z1i8Jqtths5TKEC/CRh2mP2GRJpakFfb0jY1by56ScTfimWyDHK1P3Eh NWy6xzrpPz572HhrV7uT1HAEwAsG4/LC5Apg3rq7pr6Ad6mPKbPseZs8PHHkwcxFL5IP ipLbm0mk1UxAjLg4eaRt4+sakhTAzAP+RSgWj72IQj7lGWX1LC9RzIy3UDYRwgj7L8yY 1U4CeXUxBJVXsdBB9syuZwIWfok/ZoPHFxfjCeUkx2vI7H9VP8RGuqSIB54VU1tQPNuU 21hA== X-Gm-Message-State: AOJu0YzyIpJyDK181VEy1jY/FfT2Oc7K92jjMAqOU2OCXwxSZKoxTjYj xSNsSJjr0S2Soi26IgRvlI6VU+nQtbz73CxGIvrW6HO3ISYJLX4s37+mZZNMGyV2nD9ep9aulWk XI2Kt X-Google-Smtp-Source: AGHT+IGUC/6pF5oC8/vxzttdSQu2lRe/gkbtuYYM0URRjVpxoJTD2ZPnd/Ho6xANxnLLVfn3aMu0Ug== X-Received: by 2002:a05:6512:12d6:b0:52e:91ff:4709 with SMTP id 2adb3069b0e04-530bb36f2c5mr155293e87.21.1722524307147; Thu, 01 Aug 2024 07:58:27 -0700 (PDT) Received: from platypus.lan ([2a04:cec2:9:dc84:3622:6733:ff49:ee91]) by smtp.gmail.com with ESMTPSA id 4fb4d7f45d1cf-5ac63590592sm10252456a12.25.2024.08.01.07.58.26 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 01 Aug 2024 07:58:26 -0700 (PDT) From: Arthur Cohen To: gcc-patches@gcc.gnu.org Cc: gcc-rust@gcc.gnu.org, Arthur Cohen Subject: [PATCH 005/125] gccrs: libgrust: Add format_parser library Date: Thu, 1 Aug 2024 16:56:01 +0200 Message-ID: <20240801145809.366388-7-arthur.cohen@embecosm.com> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240801145809.366388-2-arthur.cohen@embecosm.com> References: <20240801145809.366388-2-arthur.cohen@embecosm.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, 
score=-11.4 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,GIT_PATCH_0,KAM_SOMETLD_ARE_BAD_TLD,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: Compile libformat_parser and link to it. gcc/rust/ChangeLog: * Make-lang.in: Compile libformat_parser. * ast/rust-fmt.cc: New FFI definitions. * ast/rust-fmt.h: Likewise. * expand/rust-macro-builtins.cc (MacroBuiltin::format_args_handler): Call into libformat_parser. * expand/rust-macro-builtins.h: Define format_args!() handler proper. libgrust/ChangeLog: * libformat_parser/Cargo.lock: New file. * libformat_parser/Cargo.toml: New file. * libformat_parser/generic_format_parser/Cargo.toml: New file. * libformat_parser/generic_format_parser/src/lib.rs: New file. * libformat_parser/src/bin.rs: New file. * libformat_parser/src/lib.rs: New file. --- gcc/rust/Make-lang.in | 11 +- gcc/rust/ast/rust-fmt.cc | 77 +- gcc/rust/ast/rust-fmt.h | 189 +-- gcc/rust/expand/rust-macro-builtins.cc | 12 +- gcc/rust/expand/rust-macro-builtins.h | 3 + libgrust/libformat_parser/Cargo.lock | 30 + libgrust/libformat_parser/Cargo.toml | 21 + .../generic_format_parser/Cargo.toml | 9 + .../generic_format_parser/src/lib.rs | 1102 +++++++++++++++++ libgrust/libformat_parser/src/bin.rs | 7 + libgrust/libformat_parser/src/lib.rs | 41 + 11 files changed, 1349 insertions(+), 153 deletions(-) create mode 100644 libgrust/libformat_parser/Cargo.lock create mode 100644 libgrust/libformat_parser/Cargo.toml create mode 100644 libgrust/libformat_parser/generic_format_parser/Cargo.toml create mode 100644 libgrust/libformat_parser/generic_format_parser/src/lib.rs create mode 100644 libgrust/libformat_parser/src/bin.rs create mode 100644 libgrust/libformat_parser/src/lib.rs diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in index c0df49a7fee..8ac0d1d1973 100644 --- a/gcc/rust/Make-lang.in +++ 
b/gcc/rust/Make-lang.in @@ -54,6 +54,8 @@ GCCRS_D_OBJS = \ rust/rustspec.o \ $(END) +LIBS += -ldl -lpthread + gccrs$(exeext): $(GCCRS_D_OBJS) $(EXTRA_GCC_OBJS) libcommon-target.a $(LIBDEPS) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ $(GCCRS_D_OBJS) $(EXTRA_GCC_OBJS) libcommon-target.a \ @@ -218,7 +220,7 @@ RUST_LIBDEPS = $(LIBDEPS) $(LIBPROC_MACRO_INTERNAL) crab1$(exeext): $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(RUST_LIBDEPS) $(rust.prev) @$(call LINK_PROGRESS,$(INDEX.rust),start) +$(LLINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ - $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(LIBS) $(LIBPROC_MACRO_INTERNAL) $(BACKENDLIBS) + $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(LIBS) $(LIBPROC_MACRO_INTERNAL) rust/libformat_parser.a $(BACKENDLIBS) @$(call LINK_PROGRESS,$(INDEX.rust),end) # Build hooks. @@ -406,6 +408,13 @@ rust/%.o: rust/lex/%.cc $(COMPILE) $(RUST_CXXFLAGS) $(RUST_INCLUDES) $< $(POSTCOMPILE) +%.toml: + echo $@ + +rust/libformat_parser.a: $(srcdir)/../libgrust/libformat_parser/Cargo.toml $(wildcard $(srcdir)/../libgrust/libformat_parser/src/*.rs) + cargo build --manifest-path $(srcdir)/../libgrust/libformat_parser/Cargo.toml --release # FIXME: Not always release, right? 
+ cp $(srcdir)/../libgrust/libformat_parser/target/release/liblibformat_parser.a $@ + # build all rust/parse files in rust folder, add cross-folder includes rust/%.o: rust/parse/%.cc $(COMPILE) $(RUST_CXXFLAGS) $(RUST_INCLUDES) $< diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc index 9f9ba48f0c3..559b1c8b579 100644 --- a/gcc/rust/ast/rust-fmt.cc +++ b/gcc/rust/ast/rust-fmt.cc @@ -19,78 +19,23 @@ #include "rust-fmt.h" namespace Rust { -tl::expected -Fmt::parse_fmt_string (Fmt::Input input) -{ - return Fmt (); -} +namespace Fmt { -tl::expected>, Fmt::Error> -Fmt::maybe_format (Fmt::Input input) +Pieces +Pieces::collect (const std::string &to_parse) { - tl::optional none = tl::nullopt; + auto piece_slice = collect_pieces (to_parse.c_str ()); - return Fmt::Result (input, none); -} + rust_debug ("[ARTHUR] %p, %lu", (void *) piece_slice.ptr, piece_slice.len); -tl::expected, Fmt::Error> -Fmt::format (Input input) -{ - return Fmt::Result (input, Format ()); -} + // this performs multiple copies, can we avoid them maybe? + auto pieces + = std::vector (piece_slice.ptr, piece_slice.ptr + piece_slice.len); -tl::expected, Fmt::Error> -Fmt::argument (Input input) -{ - return Fmt::Result (input, Argument ()); -} + rust_debug ("[ARTHUR] %p, %lu", (void *) pieces.data (), pieces.size ()); -tl::expected, Fmt::Error> -Fmt::format_spec (Input input) -{ - return Fmt::Result (input, FormatSpec ()); -} - -tl::expected, Fmt::Error> -Fmt::fill (Input input) -{ - return Fmt::Result (input, Fill ()); -} - -tl::expected, Fmt::Error> -Fmt::align (Input input) -{ - switch (input[0]) - { - case '<': - return Fmt::Result (input.substr (1), Align::Left); - case '^': - return Fmt::Result (input.substr (1), Align::Top); - case '>': - return Fmt::Result (input.substr (1), Align::Right); - default: - // TODO: Store the character here - // TODO: Can we have proper error locations? 
- // TODO: Maybe we should use a Rust::Literal string instead of a string - return tl::make_unexpected (Error::Align); - } -} - -tl::expected, Fmt::Error> -Fmt::sign (Input input) -{ - switch (input[0]) - { - case '+': - return Fmt::Result (input.substr (1), Sign::Plus); - case '-': - return Fmt::Result (input.substr (1), Sign::Minus); - default: - // TODO: Store the character here - // TODO: Can we have proper error locations? - // TODO: Maybe we should use a Rust::Literal string instead of a string - return tl::make_unexpected (Error::Sign); - } + return Pieces{}; } +} // namespace Fmt } // namespace Rust diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h index f3dd53da979..0050977358f 100644 --- a/gcc/rust/ast/rust-fmt.h +++ b/gcc/rust/ast/rust-fmt.h @@ -19,115 +19,134 @@ #ifndef RUST_FMT_H #define RUST_FMT_H -#include "expected.h" -#include "optional.h" -#include "rust-ast.h" +#include "rust-diagnostics.h" #include "rust-system.h" namespace Rust { +namespace Fmt { -/** - * This class implements the parsing of Rust format strings according to the - * grammar here: https://doc.rust-lang.org/std/fmt/index.html#syntax - */ -// TODO: Are there features that are only present in specific Rust editions? 
-class Fmt +struct RustHamster { -public: - // TODO: Keep location information - // TODO: Switch to a Rust::AST::Literal here - using Input = std::string; + // hehe +}; - enum class Error - { - Align, - Sign, - }; +struct InnerSpan +{ +}; - template class Result +struct Count +{ + enum class Kind + { + Is, + IsName, + IsParam, + IsStar, + Implied + } kind; + + union { - public: - explicit Result (Input remaining_input, T result) - : remaining_input (remaining_input), result (result) - {} + size_t is; + std::pair is_name; + size_t is_param; + size_t is_star; + } data; +}; - private: - Input remaining_input; - T result; - }; +struct DebugHex +{ +}; - // FIXME: Do not use an owned string here - static tl::expected parse_fmt_string (Input input); +struct Sign +{ +}; -private: - // the parse functions should return the remaining input as well as the - // expected node let's look at nom - // TODO: no string view :( use an owned string for now? +struct Alignment +{ +}; - template struct ParseResult - { - tl::expected, Error> inner; +struct RustString +{ + // hehe +}; - ParseResult (tl::expected, Error> inner) : inner (inner) {} - ParseResult operator= (tl::expected, Error> inner) - { - return ParseResult (inner); - } +struct Position +{ +}; - Input remaining_input () { return inner->remaining_input; } - T value () { return inner->value; } - }; +struct FormatSpec +{ + /// Optionally specified character to fill alignment with. + tl::optional fill; + /// Span of the optionally specified fill character. + tl::optional fill_span; + /// Optionally specified alignment. + Alignment align; + /// The `+` or `-` flag. + tl::optional sign; + /// The `#` flag. + bool alternate; + /// The `0` flag. + bool zero_pad; + /// The `x` or `X` flag. (Only for `Debug`.) + tl::optional debug_hex; + /// The integer precision to use. + // Count <'a> precision; + /// The span of the precision formatting flag (for diagnostics). 
+ tl::optional precision_span; + /// The string width requested for the resulting format. + // Count <'a> width; + /// The span of the width formatting flag (for diagnostics). + tl::optional width_span; + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + RustHamster ty; + // &'a str ty; + /// The span of the descriptor string (for diagnostics). + tl::optional ty_span; +}; - struct Format - { - }; +struct Argument +{ + Position position; + InnerSpan inner_span; + FormatSpec format; +}; - struct Argument +struct Piece +{ + enum class Kind { - enum struct Kind - { - Integer, - Identifier, - } kind; + String, + NextArgument + } kind; - int integer; - Identifier identifier; - }; - - struct FormatSpec + union { - }; + RustString string; + Argument *next_argument; + } data; +}; - struct Fill - { - char to_fill; - }; +struct PieceSlice +{ + Piece *ptr; + size_t len; +}; - enum class Align - { - Left, - Top, - Right - }; +extern "C" { +PieceSlice +collect_pieces (const char *); +} - enum class Sign - { - Plus, - Minus - }; - - // let's do one function per rule in the BNF - static tl::expected, Error> text (Input input); - static tl::expected>, Error> - maybe_format (Input input); - static tl::expected, Error> format (Input input); - static tl::expected, Error> argument (Input input); - static tl::expected, Error> format_spec (Input input); - static tl::expected, Error> fill (Input input); - static tl::expected, Error> align (Input input); - static tl::expected, Error> sign (Input input); +struct Pieces +{ + static Pieces collect (const std::string &to_parse); }; +} // namespace Fmt } // namespace Rust #endif // ! 
RUST_FMT_H diff --git a/gcc/rust/expand/rust-macro-builtins.cc b/gcc/rust/expand/rust-macro-builtins.cc index 71da575563d..0e57406f10f 100644 --- a/gcc/rust/expand/rust-macro-builtins.cc +++ b/gcc/rust/expand/rust-macro-builtins.cc @@ -30,6 +30,7 @@ #include "rust-parse.h" #include "rust-session-manager.h" #include "rust-attribute-values.h" +#include "rust-fmt.h" namespace Rust { @@ -89,8 +90,8 @@ std::unordered_map {"env", MacroBuiltin::env_handler}, {"cfg", MacroBuiltin::cfg_handler}, {"include", MacroBuiltin::include_handler}, + {"format_args", MacroBuiltin::format_args_handler}, /* Unimplemented macro builtins */ - {"format_args", MacroBuiltin::sorry}, {"option_env", MacroBuiltin::sorry}, {"format_args_nl", MacroBuiltin::sorry}, {"concat_idents", MacroBuiltin::sorry}, @@ -942,6 +943,15 @@ MacroBuiltin::stringify_handler (location_t invoc_locus, return AST::Fragment ({node}, std::move (token)); } +tl::optional +MacroBuiltin::format_args_handler (location_t invoc_locus, + AST::MacroInvocData &invoc) +{ + Fmt::Pieces::collect ("heyo this {is} what I {} want to {3}, {parse}"); + + return AST::Fragment::create_empty (); +} + tl::optional MacroBuiltin::sorry (location_t invoc_locus, AST::MacroInvocData &invoc) { diff --git a/gcc/rust/expand/rust-macro-builtins.h b/gcc/rust/expand/rust-macro-builtins.h index 6a84a8b86f6..f9ab3fc3698 100644 --- a/gcc/rust/expand/rust-macro-builtins.h +++ b/gcc/rust/expand/rust-macro-builtins.h @@ -157,6 +157,9 @@ public: static tl::optional line_handler (location_t invoc_locus, AST::MacroInvocData &invoc); + static tl::optional + format_args_handler (location_t invoc_locus, AST::MacroInvocData &invoc); + static tl::optional sorry (location_t invoc_locus, AST::MacroInvocData &invoc); diff --git a/libgrust/libformat_parser/Cargo.lock b/libgrust/libformat_parser/Cargo.lock new file mode 100644 index 00000000000..65e48263c71 --- /dev/null +++ b/libgrust/libformat_parser/Cargo.lock @@ -0,0 +1,30 @@ +# This file is automatically @generated 
by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "generic_format_parser" +version = "0.1.0" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "libformat_parser" +version = "0.1.0" +dependencies = [ + "generic_format_parser", + "libc", +] + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" diff --git a/libgrust/libformat_parser/Cargo.toml b/libgrust/libformat_parser/Cargo.toml new file mode 100644 index 00000000000..0fcfa3e89a4 --- /dev/null +++ b/libgrust/libformat_parser/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "libformat_parser" +version = "0.1.0" +edition = "2021" + +[workspace] + +members = [ + "generic_format_parser", +] + +[dependencies] +libc = "0.2" +generic_format_parser = { path = "generic_format_parser" } + +[lib] +crate_type = ["staticlib", "rlib"] + +[[bin]] +name = "format_parser_test" +path = "src/bin.rs" diff --git a/libgrust/libformat_parser/generic_format_parser/Cargo.toml b/libgrust/libformat_parser/generic_format_parser/Cargo.toml new file mode 100644 index 00000000000..34577038cbe --- /dev/null +++ b/libgrust/libformat_parser/generic_format_parser/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "generic_format_parser" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +unicode-xid = "0.2.0" diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs new file mode 100644 index 00000000000..f42c9d8dffb --- /dev/null +++ b/libgrust/libformat_parser/generic_format_parser/src/lib.rs 
@@ -0,0 +1,1102 @@ +//! Macro support for format strings +//! +//! These structures are used when parsing format strings for the compiler. +//! Parsing does not happen at runtime: structures of `std::fmt::rt` are +//! generated instead. + +#![doc( + html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/", + html_playground_url = "https://play.rust-lang.org/", + test(attr(deny(warnings))) +)] +#![deny(rustc::untranslatable_diagnostic)] +#![deny(rustc::diagnostic_outside_of_impl)] +// WARNING: We want to be able to build this crate with a stable compiler, +// so no `#![feature]` attributes should be added! + +#[deprecated(note = "Use a proper lexer function for this")] +fn is_id_start(c: char) -> bool { + c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) +} + +#[deprecated(note = "Use a proper lexer function for this")] +fn is_id_continue(c: char) -> bool { + unicode_xid::UnicodeXID::is_xid_continue(c) +} + +// use rustc_lexer::unescape; +pub use Alignment::*; +pub use Count::*; +pub use Piece::*; +pub use Position::*; + +use std::iter; +use std::str; +use std::string; + +// Note: copied from rustc_span +/// Range inside of a `Span` used for diagnostics when we only have access to relative positions. 
+#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub struct InnerSpan { + pub start: usize, + pub end: usize, +} + +impl InnerSpan { + pub fn new(start: usize, end: usize) -> InnerSpan { + InnerSpan { start, end } + } +} + +/// The location and before/after width of a character whose width has changed from its source code +/// representation +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct InnerWidthMapping { + /// Index of the character in the source + pub position: usize, + /// The inner width in characters + pub before: usize, + /// The transformed width in characters + pub after: usize, +} + +impl InnerWidthMapping { + pub fn new(position: usize, before: usize, after: usize) -> InnerWidthMapping { + InnerWidthMapping { + position, + before, + after, + } + } +} + +/// Whether the input string is a literal. If yes, it contains the inner width mappings. +#[derive(Clone, PartialEq, Eq)] +enum InputStringKind { + NotALiteral, + Literal { + width_mappings: Vec<InnerWidthMapping>, + }, +} + +/// The type of format string that we are parsing. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ParseMode { + /// A normal format string as per `format_args!`. + Format, + /// An inline assembly template string for `asm!`. + InlineAsm, +} + +#[derive(Copy, Clone)] +struct InnerOffset(usize); + +impl InnerOffset { + fn to(self, end: InnerOffset) -> InnerSpan { + InnerSpan::new(self.0, end.0) + } +} + +/// A piece is a portion of the format string which represents the next part +/// to emit. These are emitted as a stream by the `Parser` class. +#[derive(Clone, Debug, PartialEq)] +pub enum Piece<'a> { + /// A literal string which should directly be emitted + String(&'a str), + /// This describes that formatting should process the next argument (as + /// specified inside) for emission. + NextArgument(Box<Argument<'a>>), +} + +/// Representation of an argument specification. 
+#[derive(Copy, Clone, Debug, PartialEq)] +pub struct Argument<'a> { + /// Where to find this argument + pub position: Position<'a>, + /// The span of the position indicator. Includes any whitespace in implicit + /// positions (`{ }`). + pub position_span: InnerSpan, + /// How to format the argument + pub format: FormatSpec<'a>, +} + +/// Specification for the formatting of an argument in the format string. +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct FormatSpec<'a> { + /// Optionally specified character to fill alignment with. + pub fill: Option<char>, + /// Span of the optionally specified fill character. + pub fill_span: Option<InnerSpan>, + /// Optionally specified alignment. + pub align: Alignment, + /// The `+` or `-` flag. + pub sign: Option<Sign>, + /// The `#` flag. + pub alternate: bool, + /// The `0` flag. + pub zero_pad: bool, + /// The `x` or `X` flag. (Only for `Debug`.) + pub debug_hex: Option<DebugHex>, + /// The integer precision to use. + pub precision: Count<'a>, + /// The span of the precision formatting flag (for diagnostics). + pub precision_span: Option<InnerSpan>, + /// The string width requested for the resulting format. + pub width: Count<'a>, + /// The span of the width formatting flag (for diagnostics). + pub width_span: Option<InnerSpan>, + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + pub ty: &'a str, + /// The span of the descriptor string (for diagnostics). + pub ty_span: Option<InnerSpan>, +} + +/// Enum describing where an argument for a format can be located. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Position<'a> { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs(usize), + /// The argument is located at a specific index given in the format, + ArgumentIs(usize), + /// The argument has a name. 
+ ArgumentNamed(&'a str), +} + +impl Position<'_> { + pub fn index(&self) -> Option<usize> { + match self { + ArgumentIs(i, ..) | ArgumentImplicitlyIs(i) => Some(*i), + _ => None, + } + } +} + +/// Enum of alignments which are supported. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Alignment { + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. + AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. + AlignUnknown, +} + +/// Enum for the sign flags. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Sign { + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, +} + +/// Enum for the debug hex flags. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum DebugHex { + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. + Upper, +} + +/// A count is used for the precision and width parameters of an integer, and +/// can reference either an argument or a literal integer. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Count<'a> { + /// The count is specified explicitly. + CountIs(usize), + /// The count is specified by the argument with the given name. + CountIsName(&'a str, InnerSpan), + /// The count is specified by the argument at the given index. + CountIsParam(usize), + /// The count is specified by a star (like in `{:.*}`) that refers to the argument at the given index. + CountIsStar(usize), + /// The count is implied and cannot be explicitly specified. 
+ CountImplied, +} + +pub struct ParseError { + pub description: string::String, + pub note: Option<string::String>, + pub label: string::String, + pub span: InnerSpan, + pub secondary_label: Option<(string::String, InnerSpan)>, + pub suggestion: Suggestion, +} + +pub enum Suggestion { + None, + /// Replace inline argument with positional argument: + /// `format!("{foo.bar}")` -> `format!("{}", foo.bar)` + UsePositional, + /// Remove `r#` from identifier: + /// `format!("{r#foo}")` -> `format!("{foo}")` + RemoveRawIdent(InnerSpan), +} + +/// The parser structure for interpreting the input format string. This is +/// modeled as an iterator over `Piece` structures to form a stream of tokens +/// being output. +/// +/// This is a recursive-descent parser for the sake of simplicity, and if +/// necessary there's probably lots of room for improvement performance-wise. +pub struct Parser<'a> { + mode: ParseMode, + input: &'a str, + cur: iter::Peekable<str::CharIndices<'a>>, + /// Error messages accumulated during parsing + pub errors: Vec<ParseError>, + /// Current position of implicit positional argument pointer + pub curarg: usize, + /// `Some(raw count)` when the string is "raw", used to position spans correctly + style: Option<usize>, + /// Start and end byte offset of every successfully parsed argument + pub arg_places: Vec<InnerSpan>, + /// Characters whose length has been changed from their in-code representation + width_map: Vec<InnerWidthMapping>, + /// Span of the last opening brace seen, used for error reporting + last_opening_brace: Option<InnerSpan>, + /// Whether the source string comes from `println!` as opposed to `format!` or `print!` + append_newline: bool, + /// Whether this formatting string was written directly in the source. This controls whether we + /// can use spans to refer into it and give better error messages. + /// N.B: This does _not_ control whether implicit argument captures can be used. + pub is_source_literal: bool, + /// Start position of the current line. 
+ cur_line_start: usize, + /// Start and end byte offset of every line of the format string. Excludes + /// newline characters and leading whitespace. + pub line_spans: Vec<InnerSpan>, +} + +impl<'a> Iterator for Parser<'a> { + type Item = Piece<'a>; + + fn next(&mut self) -> Option<Piece<'a>> { + if let Some(&(pos, c)) = self.cur.peek() { + match c { + '{' => { + let curr_last_brace = self.last_opening_brace; + let byte_pos = self.to_span_index(pos); + let lbrace_end = InnerOffset(byte_pos.0 + self.to_span_width(pos)); + self.last_opening_brace = Some(byte_pos.to(lbrace_end)); + self.cur.next(); + if self.consume('{') { + self.last_opening_brace = curr_last_brace; + + Some(String(self.string(pos + 1))) + } else { + let arg = self.argument(lbrace_end); + if let Some(rbrace_pos) = self.consume_closing_brace(&arg) { + if self.is_source_literal { + let lbrace_byte_pos = self.to_span_index(pos); + let rbrace_byte_pos = self.to_span_index(rbrace_pos); + + let width = self.to_span_width(rbrace_pos); + + self.arg_places.push( + lbrace_byte_pos.to(InnerOffset(rbrace_byte_pos.0 + width)), + ); + } + } else { + if let Some(&(_, maybe)) = self.cur.peek() { + if maybe == '?' 
{ + self.suggest_format(); + } else { + self.suggest_positional_arg_instead_of_captured_arg(arg); + } + } + } + Some(NextArgument(Box::new(arg))) + } + } + '}' => { + self.cur.next(); + if self.consume('}') { + Some(String(self.string(pos + 1))) + } else { + let err_pos = self.to_span_index(pos); + self.err_with_note( + "unmatched `}` found", + "unmatched `}`", + "if you intended to print `}`, you can escape it using `}}`", + err_pos.to(err_pos), + ); + None + } + } + _ => Some(String(self.string(pos))), + } + } else { + if self.is_source_literal { + let span = self.span(self.cur_line_start, self.input.len()); + if self.line_spans.last() != Some(&span) { + self.line_spans.push(span); + } + } + None + } + } +} + +impl<'a> Parser<'a> { + /// Creates a new parser for the given format string + pub fn new( + s: &'a str, + style: Option<usize>, + snippet: Option<string::String>, + append_newline: bool, + mode: ParseMode, + ) -> Parser<'a> { + let input_string_kind = find_width_map_from_snippet(s, snippet, style); + let (width_map, is_source_literal) = match input_string_kind { + InputStringKind::Literal { width_mappings } => (width_mappings, true), + InputStringKind::NotALiteral => (Vec::new(), false), + }; + + Parser { + mode, + input: s, + cur: s.char_indices().peekable(), + errors: vec![], + curarg: 0, + style, + arg_places: vec![], + width_map, + last_opening_brace: None, + append_newline, + is_source_literal, + cur_line_start: 0, + line_spans: vec![], + } + } + + /// Notifies of an error. The message doesn't actually need to be of type + /// String, but I think it does when this eventually uses conditions so it + /// might as well start using it now. + fn err<S1: Into<string::String>, S2: Into<string::String>>( + &mut self, + description: S1, + label: S2, + span: InnerSpan, + ) { + self.errors.push(ParseError { + description: description.into(), + note: None, + label: label.into(), + span, + secondary_label: None, + suggestion: Suggestion::None, + }); + } + + /// Notifies of an error. 
The message doesn't actually need to be of type + /// String, but I think it does when this eventually uses conditions so it + /// might as well start using it now. + fn err_with_note< + S1: Into<string::String>, + S2: Into<string::String>, + S3: Into<string::String>, + >( + &mut self, + description: S1, + label: S2, + note: S3, + span: InnerSpan, + ) { + self.errors.push(ParseError { + description: description.into(), + note: Some(note.into()), + label: label.into(), + span, + secondary_label: None, + suggestion: Suggestion::None, + }); + } + + /// Optionally consumes the specified character. If the character is not at + /// the current position, then the current iterator isn't moved and `false` is + /// returned, otherwise the character is consumed and `true` is returned. + fn consume(&mut self, c: char) -> bool { + self.consume_pos(c).is_some() + } + + /// Optionally consumes the specified character. If the character is not at + /// the current position, then the current iterator isn't moved and `None` is + /// returned, otherwise the character is consumed and the current position is + /// returned. + fn consume_pos(&mut self, c: char) -> Option<usize> { + if let Some(&(pos, maybe)) = self.cur.peek() { + if c == maybe { + self.cur.next(); + return Some(pos); + } + } + None + } + + fn remap_pos(&self, mut pos: usize) -> InnerOffset { + for width in &self.width_map { + if pos > width.position { + pos += width.before - width.after; + } else if pos == width.position && width.after == 0 { + pos += width.before; + } else { + break; + } + } + + InnerOffset(pos) + } + + fn to_span_index(&self, pos: usize) -> InnerOffset { + // This handles the raw string case, the raw argument is the number of # + // in r###"..."### (we need to add one because of the `r`). 
+ let raw = self.style.map_or(0, |raw| raw + 1); + let pos = self.remap_pos(pos); + InnerOffset(raw + pos.0 + 1) + } + + fn to_span_width(&self, pos: usize) -> usize { + let pos = self.remap_pos(pos); + match self.width_map.iter().find(|w| w.position == pos.0) { + Some(w) => w.before, + None => 1, + } + } + + fn span(&self, start_pos: usize, end_pos: usize) -> InnerSpan { + let start = self.to_span_index(start_pos); + let end = self.to_span_index(end_pos); + start.to(end) + } + + /// Forces consumption of the specified character. If the character is not + /// found, an error is emitted. + fn consume_closing_brace(&mut self, arg: &Argument<'_>) -> Option<usize> { + self.ws(); + + let pos; + let description; + + if let Some(&(peek_pos, maybe)) = self.cur.peek() { + if maybe == '}' { + self.cur.next(); + return Some(peek_pos); + } + + pos = peek_pos; + description = format!("expected `'}}'`, found `{maybe:?}`"); + } else { + description = "expected `'}'` but string was terminated".to_owned(); + // point at closing `"` + pos = self.input.len() - if self.append_newline { 1 } else { 0 }; + } + + let pos = self.to_span_index(pos); + + let label = "expected `'}'`".to_owned(); + let (note, secondary_label) = if arg.format.fill == Some('}') { + ( + Some("the character `'}'` is interpreted as a fill character because of the `:` that precedes it".to_owned()), + arg.format.fill_span.map(|sp| ("this is not interpreted as a formatting closing brace".to_owned(), sp)), + ) + } else { + ( + Some("if you intended to print `{`, you can escape it using `{{`".to_owned()), + self.last_opening_brace + .map(|sp| ("because of this opening brace".to_owned(), sp)), + ) + }; + + self.errors.push(ParseError { + description, + note, + label, + span: pos.to(pos), + secondary_label, + suggestion: Suggestion::None, + }); + + None + } + + /// Consumes all whitespace characters until the first non-whitespace character + fn ws(&mut self) { + while let Some(&(_, c)) = self.cur.peek() { + if c.is_whitespace() 
{ + self.cur.next(); + } else { + break; + } + } + } + + /// Parses all of a string which is to be considered a "raw literal" in a + /// format string. This is everything outside of the braces. + fn string(&mut self, start: usize) -> &'a str { + // we may not consume the character, peek the iterator + while let Some(&(pos, c)) = self.cur.peek() { + match c { + '{' | '}' => { + return &self.input[start..pos]; + } + '\n' if self.is_source_literal => { + self.line_spans.push(self.span(self.cur_line_start, pos)); + self.cur_line_start = pos + 1; + self.cur.next(); + } + _ => { + if self.is_source_literal && pos == self.cur_line_start && c.is_whitespace() { + self.cur_line_start = pos + c.len_utf8(); + } + self.cur.next(); + } + } + } + &self.input[start..self.input.len()] + } + + /// Parses an `Argument` structure, or what's contained within braces inside the format string. + fn argument(&mut self, start: InnerOffset) -> Argument<'a> { + let pos = self.position(); + + let end = self + .cur + .clone() + .find(|(_, ch)| !ch.is_whitespace()) + .map_or(start, |(end, _)| self.to_span_index(end)); + let position_span = start.to(end); + + let format = match self.mode { + ParseMode::Format => self.format(), + ParseMode::InlineAsm => self.inline_asm(), + }; + + // Resolve position after parsing format spec. + let pos = match pos { + Some(position) => position, + None => { + let i = self.curarg; + self.curarg += 1; + ArgumentImplicitlyIs(i) + } + }; + + Argument { + position: pos, + position_span, + format, + } + } + + /// Parses a positional argument for a format. This could either be an + /// integer index of an argument, a named argument, or a blank string. + /// Returns `Some(parsed_position)` if the position is not implicitly + /// consuming a macro argument, `None` if it's the case. 
+ fn position(&mut self) -> Option> { + if let Some(i) = self.integer() { + Some(ArgumentIs(i)) + } else { + match self.cur.peek() { + Some(&(lo, c)) if is_id_start(c) => { + let word = self.word(); + + // Recover from `r#ident` in format strings. + // FIXME: use a let chain + if word == "r" { + if let Some((pos, '#')) = self.cur.peek() { + if self.input[pos + 1..] + .chars() + .next() + .is_some_and(is_id_start) + { + self.cur.next(); + let word = self.word(); + let prefix_span = self.span(lo, lo + 2); + let full_span = self.span(lo, lo + 2 + word.len()); + self.errors.insert(0, ParseError { + description: "raw identifiers are not supported".to_owned(), + note: Some("identifiers in format strings can be keywords and don't need to be prefixed with `r#`".to_string()), + label: "raw identifier used here".to_owned(), + span: full_span, + secondary_label: None, + suggestion: Suggestion::RemoveRawIdent(prefix_span), + }); + return Some(ArgumentNamed(word)); + } + } + } + + Some(ArgumentNamed(word)) + } + + // This is an `ArgumentNext`. + // Record the fact and do the resolution after parsing the + // format spec, to make things like `{:.*}` work. + _ => None, + } + } + } + + fn current_pos(&mut self) -> usize { + if let Some(&(pos, _)) = self.cur.peek() { + pos + } else { + self.input.len() + } + } + + /// Parses a format specifier at the current position, returning all of the + /// relevant information in the `FormatSpec` struct. 
+ fn format(&mut self) -> FormatSpec<'a> { + let mut spec = FormatSpec { + fill: None, + fill_span: None, + align: AlignUnknown, + sign: None, + alternate: false, + zero_pad: false, + debug_hex: None, + precision: CountImplied, + precision_span: None, + width: CountImplied, + width_span: None, + ty: &self.input[..0], + ty_span: None, + }; + if !self.consume(':') { + return spec; + } + + // fill character + if let Some(&(idx, c)) = self.cur.peek() { + if let Some((_, '>' | '<' | '^')) = self.cur.clone().nth(1) { + spec.fill = Some(c); + spec.fill_span = Some(self.span(idx, idx + 1)); + self.cur.next(); + } + } + // Alignment + if self.consume('<') { + spec.align = AlignLeft; + } else if self.consume('>') { + spec.align = AlignRight; + } else if self.consume('^') { + spec.align = AlignCenter; + } + // Sign flags + if self.consume('+') { + spec.sign = Some(Sign::Plus); + } else if self.consume('-') { + spec.sign = Some(Sign::Minus); + } + // Alternate marker + if self.consume('#') { + spec.alternate = true; + } + // Width and precision + let mut havewidth = false; + + if self.consume('0') { + // small ambiguity with '0$' as a format string. In theory this is a + // '0' flag and then an ill-formatted format string with just a '$' + // and no count, but this is better if we instead interpret this as + // no '0' flag and '0$' as the width instead. + if let Some(end) = self.consume_pos('$') { + spec.width = CountIsParam(0); + spec.width_span = Some(self.span(end - 1, end + 1)); + havewidth = true; + } else { + spec.zero_pad = true; + } + } + + if !havewidth { + let start = self.current_pos(); + spec.width = self.count(start); + if spec.width != CountImplied { + let end = self.current_pos(); + spec.width_span = Some(self.span(start, end)); + } + } + + if let Some(start) = self.consume_pos('.') { + if self.consume('*') { + // Resolve `CountIsNextParam`. + // We can do this immediately as `position` is resolved later. 
+ let i = self.curarg; + self.curarg += 1; + spec.precision = CountIsStar(i); + } else { + spec.precision = self.count(start + 1); + } + let end = self.current_pos(); + spec.precision_span = Some(self.span(start, end)); + } + + let ty_span_start = self.current_pos(); + // Optional radix followed by the actual format specifier + if self.consume('x') { + if self.consume('?') { + spec.debug_hex = Some(DebugHex::Lower); + spec.ty = "?"; + } else { + spec.ty = "x"; + } + } else if self.consume('X') { + if self.consume('?') { + spec.debug_hex = Some(DebugHex::Upper); + spec.ty = "?"; + } else { + spec.ty = "X"; + } + } else if self.consume('?') { + spec.ty = "?"; + } else { + spec.ty = self.word(); + if !spec.ty.is_empty() { + let ty_span_end = self.current_pos(); + spec.ty_span = Some(self.span(ty_span_start, ty_span_end)); + } + } + spec + } + + /// Parses an inline assembly template modifier at the current position, returning the modifier + /// in the `ty` field of the `FormatSpec` struct. + fn inline_asm(&mut self) -> FormatSpec<'a> { + let mut spec = FormatSpec { + fill: None, + fill_span: None, + align: AlignUnknown, + sign: None, + alternate: false, + zero_pad: false, + debug_hex: None, + precision: CountImplied, + precision_span: None, + width: CountImplied, + width_span: None, + ty: &self.input[..0], + ty_span: None, + }; + if !self.consume(':') { + return spec; + } + + let ty_span_start = self.current_pos(); + spec.ty = self.word(); + if !spec.ty.is_empty() { + let ty_span_end = self.current_pos(); + spec.ty_span = Some(self.span(ty_span_start, ty_span_end)); + } + + spec + } + + /// Parses a `Count` parameter at the current position. This does not check + /// for 'CountIsNextParam' because that is only used in precision, not + /// width. 
+ fn count(&mut self, start: usize) -> Count<'a> { + if let Some(i) = self.integer() { + if self.consume('$') { + CountIsParam(i) + } else { + CountIs(i) + } + } else { + let tmp = self.cur.clone(); + let word = self.word(); + if word.is_empty() { + self.cur = tmp; + CountImplied + } else if let Some(end) = self.consume_pos('$') { + let name_span = self.span(start, end); + CountIsName(word, name_span) + } else { + self.cur = tmp; + CountImplied + } + } + } + + /// Parses a word starting at the current position. A word is the same as + /// Rust identifier, except that it can't start with `_` character. + fn word(&mut self) -> &'a str { + let start = match self.cur.peek() { + Some(&(pos, c)) if is_id_start(c) => { + self.cur.next(); + pos + } + _ => { + return ""; + } + }; + let mut end = None; + while let Some(&(pos, c)) = self.cur.peek() { + if is_id_continue(c) { + self.cur.next(); + } else { + end = Some(pos); + break; + } + } + let end = end.unwrap_or(self.input.len()); + let word = &self.input[start..end]; + if word == "_" { + self.err_with_note( + "invalid argument name `_`", + "invalid argument name", + "argument name cannot be a single underscore", + self.span(start, end), + ); + } + word + } + + fn integer(&mut self) -> Option { + let mut cur: usize = 0; + let mut found = false; + let mut overflow = false; + let start = self.current_pos(); + while let Some(&(_, c)) = self.cur.peek() { + if let Some(i) = c.to_digit(10) { + let (tmp, mul_overflow) = cur.overflowing_mul(10); + let (tmp, add_overflow) = tmp.overflowing_add(i as usize); + if mul_overflow || add_overflow { + overflow = true; + } + cur = tmp; + found = true; + self.cur.next(); + } else { + break; + } + } + + if overflow { + let end = self.current_pos(); + let overflowed_int = &self.input[start..end]; + self.err( + format!( + "integer `{}` does not fit into the type `usize` whose range is `0..={}`", + overflowed_int, + usize::MAX + ), + "integer out of range for `usize`", + self.span(start, end), + 
); + } + + found.then_some(cur) + } + + fn suggest_format(&mut self) { + if let (Some(pos), Some(_)) = (self.consume_pos('?'), self.consume_pos(':')) { + let word = self.word(); + let _end = self.current_pos(); + let pos = self.to_span_index(pos); + self.errors.insert( + 0, + ParseError { + description: "expected format parameter to occur after `:`".to_owned(), + note: Some(format!( + "`?` comes after `:`, try `{}:{}` instead", + word, "?" + )), + label: "expected `?` to occur after `:`".to_owned(), + span: pos.to(pos), + secondary_label: None, + suggestion: Suggestion::None, + }, + ); + } + } + + fn suggest_positional_arg_instead_of_captured_arg(&mut self, arg: Argument<'a>) { + if let Some(end) = self.consume_pos('.') { + let byte_pos = self.to_span_index(end); + let start = InnerOffset(byte_pos.0 + 1); + let field = self.argument(start); + // We can only parse `foo.bar` field access, any deeper nesting, + // or another type of expression, like method calls, are not supported + if !self.consume('}') { + return; + } + if let ArgumentNamed(_) = arg.position { + if let ArgumentNamed(_) = field.position { + self.errors.insert( + 0, + ParseError { + description: "field access isn't supported".to_string(), + note: None, + label: "not supported".to_string(), + span: InnerSpan::new(arg.position_span.start, field.position_span.end), + secondary_label: None, + suggestion: Suggestion::UsePositional, + }, + ); + } + } + } + } +} + +/// Finds the indices of all characters that have been processed and differ between the actual +/// written code (code snippet) and the `InternedString` that gets processed in the `Parser` +/// in order to properly synthesise the intra-string `Span`s for error diagnostics. +// TODO: Can we give an escaped string here? 
probably yes - and a valid one too +fn find_width_map_from_snippet( + input: &str, + snippet: Option, + str_style: Option, +) -> InputStringKind { + let snippet = match snippet { + Some(ref s) if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") => s, + _ => return InputStringKind::NotALiteral, + }; + + if str_style.is_some() { + return InputStringKind::Literal { + width_mappings: Vec::new(), + }; + } + + // Strip quotes. + let snippet = &snippet[1..snippet.len() - 1]; + + // Macros like `println` add a newline at the end. That technically doesn't make them "literals" anymore, but it's fine + // since we will never need to point our spans there, so we lie about it here by ignoring it. + // Since there might actually be newlines in the source code, we need to normalize away all trailing newlines. + // If we only trimmed it off the input, `format!("\n")` would cause a mismatch as here we they actually match up. + // Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up. + let input_no_nl = input.trim_end_matches('\n'); + let Some(unescaped) = unescape_string(snippet) else { + return InputStringKind::NotALiteral; + }; + + let unescaped_no_nl = unescaped.trim_end_matches('\n'); + + if unescaped_no_nl != input_no_nl { + // The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect. + // This can for example happen with proc macros that respan generated literals. 
+ return InputStringKind::NotALiteral; + } + + let mut s = snippet.char_indices(); + let mut width_mappings = vec![]; + while let Some((pos, c)) = s.next() { + match (c, s.clone().next()) { + // skip whitespace and empty lines ending in '\\' + ('\\', Some((_, '\n'))) => { + let _ = s.next(); + let mut width = 2; + + while let Some((_, c)) = s.clone().next() { + if matches!(c, ' ' | '\n' | '\t') { + width += 1; + let _ = s.next(); + } else { + break; + } + } + + width_mappings.push(InnerWidthMapping::new(pos, width, 0)); + } + ('\\', Some((_, 'n' | 't' | 'r' | '0' | '\\' | '\'' | '\"'))) => { + width_mappings.push(InnerWidthMapping::new(pos, 2, 1)); + let _ = s.next(); + } + ('\\', Some((_, 'x'))) => { + // consume `\xAB` literal + s.nth(2); + width_mappings.push(InnerWidthMapping::new(pos, 4, 1)); + } + ('\\', Some((_, 'u'))) => { + let mut width = 2; + let _ = s.next(); + + if let Some((_, next_c)) = s.next() { + if next_c == '{' { + // consume up to 6 hexanumeric chars + let digits_len = s + .clone() + .take(6) + .take_while(|(_, c)| c.is_digit(16)) + .count(); + + let len_utf8 = s + .as_str() + .get(..digits_len) + .and_then(|digits| u32::from_str_radix(digits, 16).ok()) + .and_then(char::from_u32) + .map_or(1, char::len_utf8); + + // Skip the digits, for chars that encode to more than 1 utf-8 byte + // exclude as many digits as it is greater than 1 byte + // + // So for a 3 byte character, exclude 2 digits + let required_skips = digits_len.saturating_sub(len_utf8.saturating_sub(1)); + + // skip '{' and '}' also + width += required_skips + 2; + + s.nth(digits_len); + } else if next_c.is_digit(16) { + width += 1; + + // We suggest adding `{` and `}` when appropriate, accept it here as if + // it were correct + let mut i = 0; // consume up to 6 hexanumeric chars + while let (Some((_, c)), _) = (s.next(), i < 6) { + if c.is_digit(16) { + width += 1; + } else { + break; + } + i += 1; + } + } + } + + width_mappings.push(InnerWidthMapping::new(pos, width, 1)); + } + _ 
=> {} + } + } + + InputStringKind::Literal { width_mappings } +} + +// TODO: I guess we can provide an `unescape_string` function to the parser... but how do we do that +// Store it in the parser struct? we need to make it FFI-aware +// SO this is not possible because we need `unescape_string` *before* we have a parser + +fn unescape_string(string: &str) -> Option { + // let mut buf = string::String::new(); + // let mut ok = true; + // unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| { + // match unescaped_char { + // Ok(c) => buf.push(c), + // Err(_) => ok = false, + // } + // }); + + let buf = string::String::from(string); + let ok = true; + + ok.then_some(buf) +} + +// Assert a reasonable size for `Piece` +// #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] +// rustc_index::static_assert_size!(Piece<'_>, 16); + +// #[cfg(test)] +// mod tests; \ No newline at end of file diff --git a/libgrust/libformat_parser/src/bin.rs b/libgrust/libformat_parser/src/bin.rs new file mode 100644 index 00000000000..4b1f903ad5f --- /dev/null +++ b/libgrust/libformat_parser/src/bin.rs @@ -0,0 +1,7 @@ +use libformat_parser::rust; + +fn main() { + dbg!(rust::collect_pieces( + std::env::args().nth(1).unwrap().as_str() + )); +} diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs new file mode 100644 index 00000000000..e6dc16eeb49 --- /dev/null +++ b/libgrust/libformat_parser/src/lib.rs @@ -0,0 +1,41 @@ +//! FFI interface for `rustc_format_parser` + +// what's the plan? Have a function return something that can be constructed into a vector? +// or an iterator? + +use std::ffi::CStr; + +// TODO: Use rustc's version here #3 +use generic_format_parser::Piece; + +// FIXME: Rename? 
+pub mod rust { + use generic_format_parser::{ParseMode, Parser, Piece}; + + pub fn collect_pieces(input: &str) -> Vec> { + // let parser = Parser::new(); + let parser = Parser::new(input, None, None, true, ParseMode::Format); + + parser.into_iter().collect() + } +} + +#[repr(C)] +pub struct PieceSlice { + base_ptr: *const Piece<'static /* FIXME: That's wrong */>, + len: usize, +} + +#[no_mangle] +pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice { + // FIXME: Add comment + let str = unsafe { CStr::from_ptr(input) }; + + // FIXME: No unwrap + let pieces = rust::collect_pieces(str.to_str().unwrap()); + + PieceSlice { + base_ptr: pieces.as_ptr(), + len: pieces.len(), + } +} -- 2.45.2