Adds the logic to handle -finput-charset in location_get_source_line(), so that source lines are converted from their input encodings prior to being output by the diagnostics machinery. gcc/c-family/ChangeLog: PR other/93067 * c-opts.c (c_common_post_options): Call new function input_initialize_cpp_context(). gcc/fortran/ChangeLog: PR other/93067 * cpp.c (gfc_cpp_post_options): Call new function input_initialize_cpp_context(). gcc/ChangeLog: PR other/93067 * input.c (input_initialize_cpp_context): New function. (read_data): Add prototype. (add_file_to_cache_tab): Use libcpp to convert input encoding when needed. (class fcache): Add new members to track input encoding conversion via libcpp. (fcache::fcache): Adapt for new members. (fcache::~fcache): Likewise. (maybe_grow): Likewise. (needs_read): Adapt to be aware that fp member may be NULL now. (get_next_line): Likewise. * input.h (struct cpp_reader): Forward declare for use... (input_initialize_cpp_context): ...here. Declare new function. libcpp/ChangeLog: PR other/93067 * charset.c (init_iconv_desc): Adapt to permit PFILE argument to be NULL. (_cpp_convert_input): Likewise. Also move UTF-8 BOM logic to... (cpp_check_utf8_bom): ...here. New function. (cpp_input_conversion_is_trivial): New function. * files.c (read_file_guts): Allow PFILE argument to be NULL. Add INPUT_CHARSET argument as an alternate source of this information. (cpp_get_converted_source): New function. * include/cpplib.h (struct cpp_converted_source): Declare. (cpp_get_converted_source): Declare. (cpp_input_conversion_is_trivial): Declare. (cpp_check_utf8_bom): Declare. gcc/testsuite/ChangeLog: PR other/93067 * gcc.dg/diagnostic-input-charset-1.c: New test. * gcc.dg/diagnostic-input-charset-2.c: New test. 
diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c index 59cabd12407..d5aa7859cc1 100644 --- a/gcc/c-family/c-opts.c +++ b/gcc/c-family/c-opts.c @@ -1124,6 +1124,10 @@ c_common_post_options (const char **pfilename) cpp_post_options (parse_in); init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in)); + /* Let diagnostics infrastructure know we are using libcpp to read + the input. */ + input_initialize_cpp_context (parse_in); + input_location = UNKNOWN_LOCATION; *pfilename = this_input_filename diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c index 51baf141711..2b12a98afc0 100644 --- a/gcc/fortran/cpp.c +++ b/gcc/fortran/cpp.c @@ -493,6 +493,10 @@ gfc_cpp_post_options (void) cpp_post_options (cpp_in); + /* Let diagnostics infrastructure know we are using libcpp to read + the input. */ + input_initialize_cpp_context (cpp_in); + gfc_cpp_register_include_paths (); } diff --git a/gcc/input.c b/gcc/input.c index 29d10f06b86..1dcdd464bc1 100644 --- a/gcc/input.c +++ b/gcc/input.c @@ -30,6 +30,24 @@ along with GCC; see the file COPYING3. If not see #define HAVE_ICONV 0 #endif +/* If libcpp is being used to read the data, we need to note the configuration + so we can read files back in consistently in location_get_source_line(). */ +struct +{ + bool in_use; + bool conversion_is_trivial; + const char *charset; +} static input_cpp_context; + +void input_initialize_cpp_context (cpp_reader *cpp) +{ + input_cpp_context.in_use = true; + const cpp_options *opts = cpp_get_options (cpp); + input_cpp_context.charset = opts->input_charset; + input_cpp_context.conversion_is_trivial + = cpp_input_conversion_is_trivial (input_cpp_context.charset); +} + /* This is a cache used by get_next_line to store the content of a file to be searched for file lines. */ class fcache @@ -78,6 +96,10 @@ public: far. */ char *data; + /* The allocated buffer to be freed may start a little earlier than DATA, + e.g. if a UTF8 BOM was skipped at the beginning. 
*/ + int alloc_offset; + /* The size of the DATA array above.*/ size_t size; @@ -118,6 +140,17 @@ public: fcache (); ~fcache (); + + void offset_buffer (int offset) + { + gcc_assert (offset < 0 ? alloc_offset + offset >= 0 + : (size_t) offset <= size); + gcc_assert (data); + alloc_offset += offset; + data += offset; + size -= offset; + } + }; /* Current position in real source file. */ @@ -364,6 +397,9 @@ evicted_cache_tab_entry (unsigned *highest_use_count) return to_evict; } +static bool +read_data (fcache *c); + /* Create the cache used for the content of a given file to be accessed by caret diagnostic. This cache is added to an array of cache and can be retrieved by lookup_file_in_cache_tab. This @@ -384,6 +420,8 @@ add_file_to_cache_tab (const char *file_path) if (r->fp) fclose (r->fp); r->fp = fp; + if (r->alloc_offset) + r->offset_buffer (-r->alloc_offset); r->nb_read = 0; r->line_start_idx = 0; r->line_num = 0; @@ -394,6 +432,42 @@ add_file_to_cache_tab (const char *file_path) r->total_lines = total_lines_num (file_path); r->missing_trailing_newline = true; + /* If libcpp is managing the reading, then there are two cases we need to + consider. If -finput-charset is not in effect, then we just need to + strip a UTF-8 BOM, so do that ourselves rather than calling into libcpp so + as to avoid paying the penalty of using libcpp, namely that the entire file + must be read at once. In the (generally rare) case that a non-trivial + -finput-charset is needed, then go ahead and use libcpp to read the whole + file and do the conversion. */ + if (input_cpp_context.in_use) + { + if (input_cpp_context.conversion_is_trivial) + { + /* Strip the UTF8 BOM if present. */ + if (read_data (r)) + { + const int offset = cpp_check_utf8_bom (r->data, r->nb_read); + r->offset_buffer (offset); + r->nb_read -= offset; + } + } + else + { + /* Need a full-blown conversion of the input charset. 
*/ + fclose (r->fp); + r->fp = NULL; + const cpp_converted_source cs + = cpp_get_converted_source (file_path, input_cpp_context.charset); + if (!cs.data) + return NULL; + if (r->data) + XDELETEVEC (r->data); + r->data = cs.data; + r->nb_read = r->size = cs.len; + r->alloc_offset = cs.data - cs.to_free; + } + } + return r; } @@ -415,7 +489,7 @@ lookup_or_add_file_to_cache_tab (const char *file_path) diagnostic. */ fcache::fcache () -: use_count (0), file_path (NULL), fp (NULL), data (0), +: use_count (0), file_path (NULL), fp (NULL), data (0), alloc_offset (0), size (0), nb_read (0), line_start_idx (0), line_num (0), total_lines (0), missing_trailing_newline (true) { @@ -433,6 +507,7 @@ fcache::~fcache () } if (data) { + offset_buffer (-alloc_offset); XDELETEVEC (data); data = 0; } @@ -447,9 +522,9 @@ fcache::~fcache () static bool needs_read (fcache *c) { - return (c->nb_read == 0 - || c->nb_read == c->size - || (c->line_start_idx >= c->nb_read - 1)); + return c->fp && (c->nb_read == 0 + || c->nb_read == c->size + || (c->line_start_idx >= c->nb_read - 1)); } /* Return TRUE iff the cache is full and thus needs to be @@ -469,9 +544,20 @@ maybe_grow (fcache *c) if (!needs_grow (c)) return; - size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2; - c->data = XRESIZEVEC (char, c->data, size); - c->size = size; + if (!c->data) + { + gcc_assert (c->size == 0 && c->alloc_offset == 0); + c->size = fcache_buffer_size; + c->data = XNEWVEC (char, c->size); + } + else + { + const int offset = c->alloc_offset; + c->offset_buffer (-offset); + c->size *= 2; + c->data = XRESIZEVEC (char, c->data, c->size); + c->offset_buffer (offset); + } } /* Read more data into the cache. Extends the cache if need be. @@ -570,7 +656,7 @@ get_next_line (fcache *c, char **line, ssize_t *line_len) c->missing_trailing_newline = false; } - if (ferror (c->fp)) + if (c->fp && ferror (c->fp)) return false; /* At this point, we've found the end of the of line. 
It either diff --git a/gcc/input.h b/gcc/input.h index 4790a571c6a..0f1c6dc1f27 100644 --- a/gcc/input.h +++ b/gcc/input.h @@ -214,4 +214,10 @@ class GTY(()) string_concat_db hash_map *m_table; }; +/* Because we may read files a 2nd time, after libcpp does, in order to emit + diagnostics, we need to be aware if libcpp is being used and how it has + been configured, e.g., to know the value of -finput-charset. This function + needs to be called by any frontend that is using libcpp to read its data. */ +struct cpp_reader; +void input_initialize_cpp_context (cpp_reader *cpp); #endif diff --git a/libcpp/charset.c b/libcpp/charset.c index 3e5578b1390..d6e4e096d33 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = { cset_converter structure for conversion from FROM to TO. If iconv_open() fails, issue an error and return an identity converter. Silently return an identity converter if FROM and TO - are identical. */ + are identical. + + PFILE is only used for generating diagnostics; setting it to NULL + suppresses diagnostics. 
*/ + static struct cset_converter init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) { @@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) if (ret.cd == (iconv_t) -1) { - if (errno == EINVAL) - cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ - "conversion from %s to %s not supported by iconv", - from, to); - else - cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); - + if (pfile) + { + if (errno == EINVAL) + cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ + "conversion from %s to %s not supported by iconv", + from, to); + else + cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); + } ret.func = convert_no_conversion; } } else { - cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ - "no iconv implementation, cannot convert from %s to %s", - from, to); + if (pfile) + { + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ + "no iconv implementation, cannot convert from %s to %s", + from, to); + } ret.func = convert_no_conversion; ret.cd = (iconv_t) -1; ret.width = -1; } + return ret; } @@ -2122,6 +2132,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) buf, bufp - buf, HT_ALLOC)); } + +/* Utility to strip a UTF-8 byte order marking from the beginning + of a buffer. Returns the number of bytes to skip, which currently + will be either 0 or 3. */ +int +cpp_check_utf8_bom (const char *data, size_t data_length) +{ + +#if HOST_CHARSET == HOST_CHARSET_ASCII + const unsigned char *udata = (const unsigned char *) data; + if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb + && udata[2] == 0xbf) + return 3; +#endif + + return 0; +} + + /* Convert an input buffer (containing the complete contents of one source file) from INPUT_CHARSET to the source character set. 
INPUT points to the input buffer, SIZE is its allocated size, and LEN is @@ -2135,7 +2164,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) INPUT is expected to have been allocated with xmalloc. This function will either set *BUFFER_START to INPUT, or free it and set *BUFFER_START to a pointer to another xmalloc-allocated block of - memory. */ + memory. + + PFILE is only used to generate diagnostics; setting it to NULL suppresses + diagnostics, and causes a return of NULL if there was any error instead. */ + uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, uchar *input, size_t size, size_t len, @@ -2158,17 +2191,28 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, to.text = XNEWVEC (uchar, to.asize); to.len = 0; - if (!APPLY_CONVERSION (input_cset, input, len, &to)) - cpp_error (pfile, CPP_DL_ERROR, - "failure to convert %s to %s", - CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); + const bool ok = APPLY_CONVERSION (input_cset, input, len, &to); + free (input); /* INPUT is consumed on success and failure alike. */ - free (input); - } + /* Clean up the mess. */ + if (input_cset.func == convert_using_iconv) + iconv_close (input_cset.cd); - /* Clean up the mess. */ - if (input_cset.func == convert_using_iconv) - iconv_close (input_cset.cd); + /* Handle conversion failure. */ + if (!ok) + { + if (!pfile) + { + XDELETEVEC (to.text); + *buffer_start = NULL; + *st_size = 0; + return NULL; + } + cpp_error (pfile, CPP_DL_ERROR, + "failure to convert %s to %s", + input_charset, SOURCE_CHARSET); + } + } /* Resize buffer if we allocated substantially too much, or if we haven't enough space for the \n-terminator or following @@ -2192,19 +2236,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, buffer = to.text; *st_size = to.len; -#if HOST_CHARSET == HOST_CHARSET_ASCII - /* The HOST_CHARSET test just above ensures that the source charset - is UTF-8. So, ignore a UTF-8 BOM if we see one. 
Note that - glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + + /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note + that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a BOM -- however, even if it did, we would still need this code due to the 'convert_no_conversion' case. */ - if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb - && to.text[2] == 0xbf) - { - *st_size -= 3; - buffer += 3; - } -#endif + const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len); + *st_size -= bom_len; + buffer += bom_len; *buffer_start = to.text; return buffer; @@ -2244,6 +2283,13 @@ _cpp_default_encoding (void) return current_encoding; } +/* Check if the configured input charset requires no conversion, other than + possibly stripping a UTF-8 BOM. */ +bool cpp_input_conversion_is_trivial (const char *input_charset) +{ + return !strcasecmp (input_charset, SOURCE_CHARSET); +} + /* Implementation of class cpp_string_location_reader. */ /* Constructor for cpp_string_location_reader. */ diff --git a/libcpp/files.c b/libcpp/files.c index 301b2379a23..178bb9ed1e6 100644 --- a/libcpp/files.c +++ b/libcpp/files.c @@ -173,7 +173,7 @@ static bool pch_open_file (cpp_reader *pfile, _cpp_file *file, static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file, bool *invalid_pch, location_t loc); static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, - location_t loc); + location_t loc, const char *input_charset = NULL); static bool read_file (cpp_reader *pfile, _cpp_file *file, location_t loc); static struct cpp_dir *search_path_head (cpp_reader *, const char *fname, @@ -671,18 +671,32 @@ _cpp_find_file (cpp_reader *pfile, const char *fname, cpp_dir *start_dir, Use LOC for any diagnostics. + The input charset may be specified in the INPUT_CHARSET argument, or + else it will be taken from PFILE. + + PFILE may be NULL. 
In this case, no diagnostics are issued, and the + input charset must be specified in the arguments. + FIXME: Flush file cache and try again if we run out of memory. */ static bool -read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) +read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, + const char *input_charset) { ssize_t size, total, count; uchar *buf; bool regular; + if (!input_charset) + { + gcc_assert (pfile); + input_charset = CPP_OPTION (pfile, input_charset); + } + if (S_ISBLK (file->st.st_mode)) { - cpp_error_at (pfile, CPP_DL_ERROR, loc, - "%s is a block device", file->path); + if (pfile) + cpp_error_at (pfile, CPP_DL_ERROR, loc, + "%s is a block device", file->path); return false; } @@ -699,8 +713,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) does not bite us. */ if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t)) { - cpp_error_at (pfile, CPP_DL_ERROR, loc, - "%s is too large", file->path); + if (pfile) + cpp_error_at (pfile, CPP_DL_ERROR, loc, + "%s is too large", file->path); return false; } @@ -733,29 +748,29 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) if (count < 0) { - cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc); + if (pfile) + cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc); free (buf); return false; } - if (regular && total != size && STAT_SIZE_RELIABLE (file->st)) + if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st)) cpp_error_at (pfile, CPP_DL_WARNING, loc, "%s is shorter than expected", file->path); file->buffer = _cpp_convert_input (pfile, - CPP_OPTION (pfile, input_charset), + input_charset, buf, size + 16, total, &file->buffer_start, &file->st.st_size); - file->buffer_valid = true; - - return true; + file->buffer_valid = file->buffer; + return file->buffer_valid; } /* Convenience wrapper around read_file_guts that opens the file if necessary and closes the file descriptor after reading. 
FILE must have been passed through find_file() at some stage. Use LOC for - any diagnostics. */ + any diagnostics. Unlike read_file_guts(), PFILE may not be NULL. */ static bool read_file (cpp_reader *pfile, _cpp_file *file, location_t loc) { @@ -2118,3 +2133,25 @@ _cpp_has_header (cpp_reader *pfile, const char *fname, int angle_brackets, return file->err_no != ENOENT; } +/* Read a file and convert to input charset, the same as if it were being read + by a cpp_reader. */ + +cpp_converted_source +cpp_get_converted_source (const char *fname, const char *input_charset) +{ + cpp_converted_source res = {}; + _cpp_file file = {}; + file.fd = -1; + file.name = lbasename (fname); + file.path = fname; + if (!open_file (&file)) + return res; + const bool ok = read_file_guts (NULL, &file, 0, input_charset); + close (file.fd); + if (!ok) + return res; + res.to_free = (char *) file.buffer_start; + res.data = (char *) file.buffer; + res.len = file.st.st_size; + return res; +} diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 50d28dc9d5a..d38dd040367 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1368,6 +1368,20 @@ extern struct _cpp_file *cpp_get_file (cpp_buffer *); extern cpp_buffer *cpp_get_prev (cpp_buffer *); extern void cpp_clear_file_cache (cpp_reader *); +/* cpp_get_converted_source returns the contents of the given file, as it exists + after cpplib has read it and converted it from the input charset to the + source charset. Return struct will be zero-filled if the data could not be + read for any reason. The data starts at the DATA pointer, but the TO_FREE + pointer is what should be passed to free(), as there may be an offset. 
*/ +struct cpp_converted_source +{ + char *to_free; + char *data; + size_t len; +}; +cpp_converted_source cpp_get_converted_source (const char *fname, + const char *input_charset); + /* In pch.c */ struct save_macro_data; extern int cpp_save_state (cpp_reader *, FILE *); @@ -1438,6 +1452,7 @@ class cpp_display_width_computation { /* Convenience functions that are simple use cases for class cpp_display_width_computation. Tab characters will be expanded to spaces as determined by TABSTOP. */ + int cpp_byte_column_to_display_column (const char *data, int data_length, int column, int tabstop); inline int cpp_display_width (const char *data, int data_length, @@ -1450,4 +1465,7 @@ int cpp_display_column_to_byte_column (const char *data, int data_length, int display_col, int tabstop); int cpp_wcwidth (cppchar_t c); +bool cpp_input_conversion_is_trivial (const char *input_charset); +int cpp_check_utf8_bom (const char *data, size_t data_length); + #endif /* ! LIBCPP_CPPLIB_H */