* Re: [patch,avr] Fix PR 65657 - read from __memx address space tramples arguments to function call
2015-04-16 16:47 ` Georg-Johann Lay
@ 2015-04-16 16:49 ` Georg-Johann Lay
2015-04-16 16:55 ` Denis Chertykov
2015-04-17 7:45 ` Senthil Kumar Selvaraj
2 siblings, 0 replies; 9+ messages in thread
From: Georg-Johann Lay @ 2015-04-16 16:49 UTC (permalink / raw)
To: Senthil Kumar Selvaraj; +Cc: gcc-patches, chertykov
[-- Attachment #1: Type: text/plain, Size: 30 bytes --]
...and the sketch against 4.9
[-- Attachment #2: fixregs.diff --]
[-- Type: text/x-patch, Size: 6802 bytes --]
Index: config/avr/avr.c
===================================================================
--- config/avr/avr.c (revision 221321)
+++ config/avr/avr.c (working copy)
@@ -287,6 +287,94 @@ avr_to_int_mode (rtx x)
}
+static void
+avr_rest_of_handle_expand_xload (void)
+{
+ basic_block bb;
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ rtx insn, next;
+
+ FOR_BB_INSNS_SAFE (bb, insn, next)
+ {
+ enum attr_fixregs fixregs;
+
+ if (-1 == recog_memoized (insn)
+ || FIXREGS_NO == (fixregs = get_attr_fixregs (insn)))
+ {
+ continue;
+ }
+
+ avr_edump ("%?: Must fix insn:\n %r\n\n", insn);
+ if (dump_file)
+ avr_fdump (dump_file, ";; Fixing insn :\n %r\n\n", insn);
+
+ extract_insn (insn);
+
+ switch (fixregs)
+ {
+ default:
+ gcc_unreachable();
+
+ case FIXREGS_XLOADQI_A:
+ if (dump_file)
+ avr_fdump (dump_file, "$2 = %r\n\n", recog_data.operand[2]);
+ break;
+
+ case FIXREGS_XLOAD_A:
+ if (dump_file)
+ {
+ avr_fdump (dump_file, "$2 = %r\n", recog_data.operand[2]);
+ avr_fdump (dump_file, "$3 = %r\n", recog_data.operand[3]);
+ avr_fdump (dump_file, "$4 = %r\n\n", recog_data.operand[4]);
+ }
+ break;
+ }
+ } // insn
+ }
+}
+
+
+static const pass_data avr_pass_data_expand_xload =
+{
+ RTL_PASS, // type
+ "", // name (will be patched)
+ OPTGROUP_NONE, // optinfo_flags
+ false, // has_gate
+ true, // has_execute
+ TV_NONE, // tv_id
+ 0, // properties_required
+ 0, // properties_provided
+ 0, // properties_destroyed
+ 0, // todo_flags_start
+ // todo_flags_finish
+ TODO_df_finish | TODO_verify_rtl_sharing | TODO_verify_flow
+};
+
+
+class avr_pass_expand_xload : public rtl_opt_pass
+{
+public:
+ avr_pass_expand_xload (gcc::context *ctxt, const char *name)
+ : rtl_opt_pass (avr_pass_data_expand_xload, ctxt)
+ {
+ this->name = name;
+ }
+
+ unsigned int execute (void)
+ {
+ df_lr_add_problem ();
+ df_live_add_problem ();
+ df_analyze ();
+
+ avr_rest_of_handle_expand_xload();
+
+ return 0;
+ }
+}; // avr_pass_recompute_notes
+
+
static const pass_data avr_pass_data_recompute_notes =
{
RTL_PASS, // type
@@ -326,6 +414,11 @@ public:
static void
avr_register_passes (void)
{
+ /* This avr-specific fixed PR63633 for the case of mov insns. */
+
+ register_pass (new avr_pass_expand_xload (g, "avr-expand-xload"),
+ PASS_POS_INSERT_BEFORE, "split1", 1);
+
/* This avr-specific pass (re)computes insn notes, in particular REG_DEAD
notes which are used by `avr.c::reg_unused_after' and branch offset
computations. These notes must be correct, i.e. there must be no
Index: config/avr/avr.md
===================================================================
--- config/avr/avr.md (revision 221321)
+++ config/avr/avr.md (working copy)
@@ -165,6 +165,13 @@ (define_attr "isa"
standard"
(const_string "standard"))
+
+(define_attr "fixregs"
+ "xload_A, xloadQI_A,
+ no"
+ (const_string "no"))
+
+
(define_attr "enabled" ""
(cond [(eq_attr "isa" "standard")
(const_int 1)
@@ -494,9 +501,9 @@ (define_insn "load_<mode>_libgcc"
;; "xload8qi_A"
;; "xload8qq_A" "xload8uqq_A"
(define_insn_and_split "xload8<mode>_A"
- [(set (match_operand:ALL1 0 "register_operand" "=r")
- (match_operand:ALL1 1 "memory_operand" "m"))
- (clobber (reg:HI REG_Z))]
+ [(set (match_operand:ALL1 0 "register_operand" "=r")
+ (match_operand:ALL1 1 "memory_operand" "m"))
+ (clobber (match_operand:HI 2 "scratch_operand" "=z"))] ;; HI 30
"can_create_pseudo_p()
&& !avr_xload_libgcc_p (<MODE>mode)
&& avr_mem_memx_p (operands[1])
@@ -505,6 +512,8 @@ (define_insn_and_split "xload8<mode>_A"
"&& 1"
[(clobber (const_int 0))]
{
+ gcc_assert (SCRATCH != GET_CODE (operands[2]));
+
/* ; Split away the high part of the address. GCC's register allocator
; in not able to allocate segment registers and reload the resulting
; expressions. Notice that no address register can hold a PSImode. */
@@ -520,7 +529,9 @@ (define_insn_and_split "xload8<mode>_A"
set_mem_addr_space (SET_SRC (single_set (insn)),
MEM_ADDR_SPACE (operands[1]));
DONE;
- })
+ }
+ [(set_attr "fixregs" "xloadQI_A")])
+
;; "xloadqi_A" "xloadqq_A" "xloaduqq_A"
;; "xloadhi_A" "xloadhq_A" "xloaduhq_A" "xloadha_A" "xloaduha_A"
@@ -530,9 +541,9 @@ (define_insn_and_split "xload8<mode>_A"
(define_insn_and_split "xload<mode>_A"
[(set (match_operand:MOVMODE 0 "register_operand" "=r")
(match_operand:MOVMODE 1 "memory_operand" "m"))
- (clobber (reg:MOVMODE 22))
- (clobber (reg:QI 21))
- (clobber (reg:HI REG_Z))]
+ (clobber (match_operand:MOVMODE 2 "scratch_operand" "=r")) ;; MOVMODE 22
+ (clobber (match_operand:QI 3 "scratch_operand" "=r")) ;; QI 21
+ (clobber (match_operand:HI 4 "scratch_operand" "=z"))] ;; HI 30 (Z)
"can_create_pseudo_p()
&& avr_mem_memx_p (operands[1])
&& REG_P (XEXP (operands[1], 0))"
@@ -540,6 +551,10 @@ (define_insn_and_split "xload<mode>_A"
"&& 1"
[(clobber (const_int 0))]
{
+ gcc_assert (SCRATCH != GET_CODE (operands[2]));
+ gcc_assert (SCRATCH != GET_CODE (operands[3]));
+ gcc_assert (SCRATCH != GET_CODE (operands[4]));
+
rtx addr = XEXP (operands[1], 0);
rtx reg_z = gen_rtx_REG (HImode, REG_Z);
rtx addr_hi8 = simplify_gen_subreg (QImode, addr, PSImode, 2);
@@ -558,7 +573,9 @@ (define_insn_and_split "xload<mode>_A"
emit_move_insn (operands[0], gen_rtx_REG (<MODE>mode, 22));
DONE;
- })
+ }
+ [(set_attr "fixregs" "xload_A")])
+
;; Move value from address space memx to a register
;; These insns must be prior to respective generic move insn.
@@ -638,10 +655,10 @@ (define_expand "mov<mode>"
if (!avr_xload_libgcc_p (<MODE>mode))
/* ; No <mode> here because gen_xload8<mode>_A only iterates over ALL1.
; insn-emit does not depend on the mode, it's all about operands. */
- emit_insn (gen_xload8qi_A (dest, src));
+ emit_insn (gen_xload8qi_A (dest, src, gen_rtx_SCRATCH (HImode)));
else
- emit_insn (gen_xload<mode>_A (dest, src));
-
+ emit_insn (gen_xload<mode>_A (dest, src, gen_rtx_SCRATCH (<MODE>mode),
+ gen_rtx_SCRATCH (QImode), gen_rtx_SCRATCH (HImode)));
DONE;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch,avr] Fix PR 65657 - read from __memx address space tramples arguments to function call
2015-04-16 16:47 ` Georg-Johann Lay
2015-04-16 16:49 ` Georg-Johann Lay
@ 2015-04-16 16:55 ` Denis Chertykov
2015-04-17 7:45 ` Senthil Kumar Selvaraj
2 siblings, 0 replies; 9+ messages in thread
From: Denis Chertykov @ 2015-04-16 16:55 UTC (permalink / raw)
To: Georg-Johann Lay; +Cc: Senthil Kumar Selvaraj, gcc-patches
2015-04-16 19:47 GMT+03:00 Georg-Johann Lay <avr@gjlay.de>:
>
> Am 04/16/2015 um 11:28 AM schrieb Senthil Kumar Selvaraj:
>>
>> On Thu, Apr 16, 2015 at 11:02:05AM +0200, Georg-Johann Lay wrote:
>>>
>>> Am 04/16/2015 um 08:43 AM schrieb Senthil Kumar Selvaraj:
>>>>
>>>> This patch fixes PR 65657.
>>>
>>>
>>> The following artifact appears to be PR63633.
>>>
>>
>> I did see that one - unfortunately, that fix won't help here. IIUC, you
>> check if input/output operand hard regs are in the clobber list,
>> and if yes, you generate pseudos to save and restore clobbered hard
>> regs.
>>
>> In this case, the reg is actually clobbered by a different insn (one
>
>
> Arrgh, yes...
>
>> that loads the next argument to the function). So unless I blindly generate pseudos for
>> all regs in the clobber list, the clobbering will still happen.
>>
>> FWIW, I did try saving/restoring all clobbered regs, and it did fix the
>> problem - just that it appeared like a (worse) hack to me. Aren't we
>> manually replicating what RA/reload should be doing?
>
>
> As it appears, we'll have to do it by hand. The attaches patch is just a sketch that indicates how the problem could be approached. Notice the new assertions in the split expanders; they will throw ICE until the fix is actually installed.
>
> The critical insn are generated in movMM expander and are changed to have no clobbers (SCRATCHes actually). An a later pass, when appropriate life info can be made available, run yet another avr pass that
>
> 1a) Save-restore needed hard regs around the insn.
>
> 2a) Kick out hard regs overlapping the clobbers, e.g. in set_src, into new pseudos. Maybe that could happen due to some hard regs progagation, or we can use a new predicate similar combine_pseudo_register_operand.
>
> 3) Replace scratch -> hard regs for all scratch_operands.
>
> 2b) Restore respective hard regs from their pseudos.
>
> 1b) Restore respective hard regs from their pseudos.
>
>
> And maybe we can get better code by allocating things like address register by hand and get better code then.
>
> When I implemented some of the libgcc insns I tried to express the operand by means of constraints, e.h. for (reg:HI 22) and let register allocator do the job.
>
> The resulting code was functional but *horrific*.
>
> The register allocator is not yet ready to generate efficient code in such demanding situations...
>
>>
>> What do you think?
>>
>
> IMO sooner or later we'll need such an infrastructure; maybe also for non-mov insn that are implemented by transparent libcalls like divmod, mul, etc.
>
>>>> When cfgexpand.c expands a function call, it first figures out the
>>>> registers in which the arguments will go, followed by expansion of the
>>>> arguments themselves (right to left). It later emits mov insns to set
>>>> the precomputed registers with the expanded RTXes.
>>>>
>>>> If one of the arguments is a __memx char pointer dereference, the mov
>>>> eventually expands to gen_xload<mode>_A (for certain cases), which
>>>> clobbers R22, R21 and Z. This causes problems if one of those
>>>> clobbered registers was precomputed to hold another argument.
>>>>
>>>> In general, call expansion does not appear to take clobbers into account -
>
>
> We had been warned that using hard regs is evil... But without that technique the code quality would decrease way too much.
>
>>>> when it checks for argument overlap, the RTX (args[i].value) is only a MEM
>>>> in QImode for the memx deref - the clobber shows up when it eventually
>>>> calls emit_move_insn, at which point, it is too late.
>
>
> Such situations could only be handled by a target hook which allowed to expand specific trees by hand... Such a hook could cater for insn that must use hard registers.
>
>>>> This does not happen for a int pointer dereference - turns out that
>>>> precompute_register_parameters does a copy_to_mode_reg if the
>>>> cost of args[i].value is more than COSTS_N_INSNS(1) i.e., it creates a
>>>> pseudo and later assigns the pseudo to the arg register. This is done
>>>> before any moves to arg registers is done, so other arguments are not
>>>> overwritten.
>>>>
>>>> Doing the same thing - providing a better cost estimate for a MEM rtx in
>>>> the non-default address space, makes this problem go away, and that is
>>>> what this patch does. Regression testing does not show any new failures.
>
>
> Can you tell something about overall code quality? If it is not significantly worse then I'd propose to apply your rtx-costs solution soon. The full fix will take more time to work it out.
I'm agree with Georg.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch,avr] Fix PR 65657 - read from __memx address space tramples arguments to function call
2015-04-16 16:47 ` Georg-Johann Lay
2015-04-16 16:49 ` Georg-Johann Lay
2015-04-16 16:55 ` Denis Chertykov
@ 2015-04-17 7:45 ` Senthil Kumar Selvaraj
2015-04-17 11:34 ` Denis Chertykov
2015-04-23 7:02 ` Senthil Kumar Selvaraj
2 siblings, 2 replies; 9+ messages in thread
From: Senthil Kumar Selvaraj @ 2015-04-17 7:45 UTC (permalink / raw)
To: Georg-Johann Lay; +Cc: gcc-patches, chertykov
On Thu, Apr 16, 2015 at 06:47:26PM +0200, Georg-Johann Lay wrote:
> Am 04/16/2015 um 11:28 AM schrieb Senthil Kumar Selvaraj:
> >On Thu, Apr 16, 2015 at 11:02:05AM +0200, Georg-Johann Lay wrote:
> >>Am 04/16/2015 um 08:43 AM schrieb Senthil Kumar Selvaraj:
> >>>This patch fixes PR 65657.
> >>
> >>The following artifact appears to be PR63633.
> >>
> >
> >I did see that one - unfortunately, that fix won't help here. IIUC, you
> >check if input/output operand hard regs are in the clobber list,
> >and if yes, you generate pseudos to save and restore clobbered hard
> >regs.
> >
> >In this case, the reg is actually clobbered by a different insn (one
>
> Arrgh, yes...
>
> >that loads the next argument to the function). So unless I blindly generate pseudos for
> >all regs in the clobber list, the clobbering will still happen.
> >
> >FWIW, I did try saving/restoring all clobbered regs, and it did fix the
> >problem - just that it appeared like a (worse) hack to me. Aren't we
> >manually replicating what RA/reload should be doing?
>
> As it appears, we'll have to do it by hand. The attaches patch is just a
> sketch that indicates how the problem could be approached. Notice the new
> assertions in the split expanders; they will throw ICE until the fix is
> actually installed.
>
> The critical insn are generated in movMM expander and are changed to have no
> clobbers (SCRATCHes actually). An a later pass, when appropriate life info
> can be made available, run yet another avr pass that
>
> 1a) Save-restore needed hard regs around the insn.
>
> 2a) Kick out hard regs overlapping the clobbers, e.g. in set_src, into new
> pseudos. Maybe that could happen due to some hard regs progagation, or we
> can use a new predicate similar combine_pseudo_register_operand.
>
> 3) Replace scratch -> hard regs for all scratch_operands.
>
> 2b) Restore respective hard regs from their pseudos.
>
> 1b) Restore respective hard regs from their pseudos.
>
>
> And maybe we can get better code by allocating things like address register
> by hand and get better code then.
>
> When I implemented some of the libgcc insns I tried to express the operand
> by means of constraints, e.h. for (reg:HI 22) and let register allocator do
> the job.
>
> The resulting code was functional but *horrific*.
>
> The register allocator is not yet ready to generate efficient code in such
> demanding situations...
>
> >
> >What do you think?
> >
>
> IMO sooner or later we'll need such an infrastructure; maybe also for
> non-mov insn that are implemented by transparent libcalls like divmod, mul,
> etc.
I'm curious how other embedded targets handle this though - arm, for
example? Surely they should have some libcalls/builtins which need
specific hard regs? I should check that out.
>
> >>>When cfgexpand.c expands a function call, it first figures out the
> >>>registers in which the arguments will go, followed by expansion of the
> >>>arguments themselves (right to left). It later emits mov insns to set
> >>>the precomputed registers with the expanded RTXes.
> >>>
> >>>If one of the arguments is a __memx char pointer dereference, the mov
> >>>eventually expands to gen_xload<mode>_A (for certain cases), which
> >>>clobbers R22, R21 and Z. This causes problems if one of those
> >>>clobbered registers was precomputed to hold another argument.
> >>>
> >>>In general, call expansion does not appear to take clobbers into account -
>
> We had been warned that using hard regs is evil... But without that
> technique the code quality would decrease way too much.
Oh ok - do you know some place where this is documented or was
discussed?
>
> >>>when it checks for argument overlap, the RTX (args[i].value) is only a MEM
> >>>in QImode for the memx deref - the clobber shows up when it eventually
> >>>calls emit_move_insn, at which point, it is too late.
>
> Such situations could only be handled by a target hook which allowed to
> expand specific trees by hand... Such a hook could cater for insn that must
> use hard registers.
Yes, I guess so :(
>
> >>>This does not happen for a int pointer dereference - turns out that
> >>>precompute_register_parameters does a copy_to_mode_reg if the
> >>>cost of args[i].value is more than COSTS_N_INSNS(1) i.e., it creates a
> >>>pseudo and later assigns the pseudo to the arg register. This is done
> >>>before any moves to arg registers is done, so other arguments are not
> >>>overwritten.
> >>>
> >>>Doing the same thing - providing a better cost estimate for a MEM rtx in
> >>>the non-default address space, makes this problem go away, and that is
> >>>what this patch does. Regression testing does not show any new failures.
>
> Can you tell something about overall code quality? If it is not
> significantly worse then I'd propose to apply your rtx-costs solution soon.
> The full fix will take more time to work it out.
>
For this piece of code -
void foo ( char c, unsigned int d);
void readx (const char __memx *x)
{
foo (*x, 0xABCD);
}
the buggy code (for readx) looks like this
mov r18,r22
mov r25,r23
ldi r22,lo8(-51)
ldi r23,lo8(-85)
mov r30,r18
mov r31,r25
mov r21,r24
call __xload_1
mov r24,r22
jmp foo
With the patch, the code looks like this
movw r30,r22
mov r21,r24
call __xload_1
mov r24,r22
ldi r22,lo8(-51)
ldi r23,lo8(-85)
jmp foo
The code turns out to be better in this case, as the loads to r22,r23
get done later.
The dump for cfgexpand is where the change originates - the memx dereference is
saved in a pseudo before the moves to argument registers start.
Before:
(insn 2 4 3 2 (set (reg/v/f:PSI 43 [ x ])
(reg:PSI 22 r22 [ x ])) foo.c:4 -1
(nil))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 7 2 (set (reg:HI 22 r22)
(const_int -21555 [0xffffffffffffabcd])) foo.c:5 -1
(nil))
(insn 7 6 8 2 (parallel [
(set (reg:QI 24 r24)
(mem:QI (reg/v/f:PSI 43 [ x ]) [0 *x_2(D)+0 S1 A8 AS7]))
(clobber (reg:QI 22 r22))
(clobber (reg:QI 21 r21))
(clobber (reg:HI 30 r30))
]) foo.c:5 -1
(nil))
(call_insn/j 8 7 9 2 (parallel [
(call (mem:HI (symbol_ref:HI ("foo") [flags 0x41] <function_decl 0x7f7129589900 foo>) [0 foo S2 A8])
(const_int 0 [0]))
(use (const_int 1 [0x1]))
]) foo.c:5 -1
(nil)
(expr_list:REG_EQUAL (use (reg:QI 24 r24))
(expr_list:REG_NONNEG (use (reg:HI 22 r22))
(nil))))
(barrier 9 8 0)
After:
(insn 2 4 3 2 (set (reg/v/f:PSI 43 [ x ])
(reg:PSI 22 r22 [ x ])) foo.c:4 -1
(nil))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 7 2 (parallel [
(set (reg:QI 44)
(mem:QI (reg/v/f:PSI 43 [ x ]) [0 *x_2(D)+0 S1 A8 AS7]))
(clobber (reg:QI 22 r22))
(clobber (reg:QI 21 r21))
(clobber (reg:HI 30 r30))
]) foo.c:5 -1
(nil))
(insn 7 6 8 2 (set (reg:HI 22 r22)
(const_int -21555 [0xffffffffffffabcd])) foo.c:5 -1
(nil))
(insn 8 7 9 2 (set (reg:QI 24 r24)
(reg:QI 44)) foo.c:5 -1
(nil))
(call_insn/j 9 8 10 2 (parallel [
(call (mem:HI (symbol_ref:HI ("foo") [flags 0x41] <function_decl 0x7fa986a0f900 foo>) [0 foo S2 A8])
(const_int 0 [0]))
(use (const_int 1 [0x1]))
]) foo.c:5 -1
(nil)
(expr_list:REG_EQUAL (use (reg:QI 24 r24))
(expr_list:REG_NONNEG (use (reg:HI 22 r22))
(nil))))
(barrier 10 9 0)
Without the patch, and with the arguments
reversed, like this
void foo ( unsigned int d, char c);
void readx (const char __memx *x)
{
foo (0xABCD, *x);
}
the compiler elides the dereference itself - it just emits
ldi r24,lo8(-51)
ldi r25,lo8(-85)
jmp foo
Scary :). That's because expand has
(insn 6 3 7 2 (parallel [
(set (reg:QI 22 r22)
(mem:QI (reg/v/f:PSI 43 [ x ]) [0 *x_2(D)+0 S1 A8 AS7]))
(clobber (reg:QI 22 r22))
(clobber (reg:QI 21 r21))
(clobber (reg:HI 30 r30))
]) foo.c:5 -1
(nil))
but this, I guess, will not happen if the fix for PR63633 was applied.
With my cost patch, I get
movw r30,r22
mov r21,r24
call __xload_1
ldi r24,lo8(-51)
ldi r25,lo8(-85)
jmp foo
For the testcase posted originally for the bug,
<snip>
extern void writeFlashByte(uint8_t byte, uint16_t handle);
void writeBuf(const uint8_t __memx *from, const uint8_t __memx *to)
{
uint16_t handle = ((uint16_t)(((__uint24)to) & 0xFFFFUL));
writeFlashByte(*from++, handle++);
}
</snip>
Before:
mov r27,r22
mov r26,r23
mov r21,r24
mov r24,r20
movw r22,r18
mov r30,r27
mov r31,r26
call __xload_1
mov r24,r22
jmp writeFlashByte
After:
push r16
push r17
movw r16,r18
mov r18,r20
movw r30,r22
mov r21,r24
call __xload_1
mov r24,r22
movw r22,r16
pop r17
pop r16
jmp writeFlashByte
Looks like worse code, but reload had to choose call saved regs (r16 and r17) for zero_extendpsisi's
output operand, as we don't allow REG_X (or greater) for SI mode values
in avr_hard_regno_mode_ok.
Looks reasonable to me - what do you guys say?
Regards
Senthil
>
> Johann
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch,avr] Fix PR 65657 - read from __memx address space tramples arguments to function call
2015-04-17 7:45 ` Senthil Kumar Selvaraj
@ 2015-04-17 11:34 ` Denis Chertykov
2015-04-23 7:02 ` Senthil Kumar Selvaraj
1 sibling, 0 replies; 9+ messages in thread
From: Denis Chertykov @ 2015-04-17 11:34 UTC (permalink / raw)
To: Senthil Kumar Selvaraj; +Cc: Georg-Johann Lay, gcc-patches
2015-04-17 10:46 GMT+03:00 Senthil Kumar Selvaraj
<senthil_kumar.selvaraj@atmel.com>:
>
> On Thu, Apr 16, 2015 at 06:47:26PM +0200, Georg-Johann Lay wrote:
> > Am 04/16/2015 um 11:28 AM schrieb Senthil Kumar Selvaraj:
> > >On Thu, Apr 16, 2015 at 11:02:05AM +0200, Georg-Johann Lay wrote:
> > >>Am 04/16/2015 um 08:43 AM schrieb Senthil Kumar Selvaraj:
> > >>>This patch fixes PR 65657.
> > >>
> > >>The following artifact appears to be PR63633.
> > >>
> > >
> > >I did see that one - unfortunately, that fix won't help here. IIUC, you
> > >check if input/output operand hard regs are in the clobber list,
> > >and if yes, you generate pseudos to save and restore clobbered hard
> > >regs.
> > >
> > >In this case, the reg is actually clobbered by a different insn (one
> >
> > Arrgh, yes...
> >
> > >that loads the next argument to the function). So unless I blindly generate pseudos for
> > >all regs in the clobber list, the clobbering will still happen.
> > >
> > >FWIW, I did try saving/restoring all clobbered regs, and it did fix the
> > >problem - just that it appeared like a (worse) hack to me. Aren't we
> > >manually replicating what RA/reload should be doing?
> >
> > As it appears, we'll have to do it by hand. The attaches patch is just a
> > sketch that indicates how the problem could be approached. Notice the new
> > assertions in the split expanders; they will throw ICE until the fix is
> > actually installed.
> >
> > The critical insn are generated in movMM expander and are changed to have no
> > clobbers (SCRATCHes actually). An a later pass, when appropriate life info
> > can be made available, run yet another avr pass that
> >
> > 1a) Save-restore needed hard regs around the insn.
> >
> > 2a) Kick out hard regs overlapping the clobbers, e.g. in set_src, into new
> > pseudos. Maybe that could happen due to some hard regs progagation, or we
> > can use a new predicate similar combine_pseudo_register_operand.
> >
> > 3) Replace scratch -> hard regs for all scratch_operands.
> >
> > 2b) Restore respective hard regs from their pseudos.
> >
> > 1b) Restore respective hard regs from their pseudos.
> >
> >
> > And maybe we can get better code by allocating things like address register
> > by hand and get better code then.
> >
> > When I implemented some of the libgcc insns I tried to express the operand
> > by means of constraints, e.h. for (reg:HI 22) and let register allocator do
> > the job.
> >
> > The resulting code was functional but *horrific*.
> >
> > The register allocator is not yet ready to generate efficient code in such
> > demanding situations...
> >
> > >
> > >What do you think?
> > >
> >
> > IMO sooner or later we'll need such an infrastructure; maybe also for
> > non-mov insn that are implemented by transparent libcalls like divmod, mul,
> > etc.
>
> I'm curious how other embedded targets handle this though - arm, for
> example? Surely they should have some libcalls/builtins which need
> specific hard regs? I should check that out.
>
> >
> > >>>When cfgexpand.c expands a function call, it first figures out the
> > >>>registers in which the arguments will go, followed by expansion of the
> > >>>arguments themselves (right to left). It later emits mov insns to set
> > >>>the precomputed registers with the expanded RTXes.
> > >>>
> > >>>If one of the arguments is a __memx char pointer dereference, the mov
> > >>>eventually expands to gen_xload<mode>_A (for certain cases), which
> > >>>clobbers R22, R21 and Z. This causes problems if one of those
> > >>>clobbered registers was precomputed to hold another argument.
> > >>>
> > >>>In general, call expansion does not appear to take clobbers into account -
> >
> > We had been warned that using hard regs is evil... But without that
> > technique the code quality would decrease way too much.
>
> Oh ok - do you know some place where this is documented or was
> discussed?
https://gcc.gnu.org/ml/gcc/2014-10/msg00207.html
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch,avr] Fix PR 65657 - read from __memx address space tramples arguments to function call
2015-04-17 7:45 ` Senthil Kumar Selvaraj
2015-04-17 11:34 ` Denis Chertykov
@ 2015-04-23 7:02 ` Senthil Kumar Selvaraj
1 sibling, 0 replies; 9+ messages in thread
From: Senthil Kumar Selvaraj @ 2015-04-23 7:02 UTC (permalink / raw)
To: Georg-Johann Lay; +Cc: gcc-patches, chertykov
On Fri, Apr 17, 2015 at 01:16:58PM +0530, Senthil Kumar Selvaraj wrote:
> On Thu, Apr 16, 2015 at 06:47:26PM +0200, Georg-Johann Lay wrote:
> > Am 04/16/2015 um 11:28 AM schrieb Senthil Kumar Selvaraj:
> > >On Thu, Apr 16, 2015 at 11:02:05AM +0200, Georg-Johann Lay wrote:
> > >>Am 04/16/2015 um 08:43 AM schrieb Senthil Kumar Selvaraj:
> > >>>This patch fixes PR 65657.
> > >>
> > >>The following artifact appears to be PR63633.
> > >>
> > >
> > >I did see that one - unfortunately, that fix won't help here. IIUC, you
> > >check if input/output operand hard regs are in the clobber list,
> > >and if yes, you generate pseudos to save and restore clobbered hard
> > >regs.
> > >
> > >In this case, the reg is actually clobbered by a different insn (one
> >
> > Arrgh, yes...
> >
> > >that loads the next argument to the function). So unless I blindly generate pseudos for
> > >all regs in the clobber list, the clobbering will still happen.
> > >
> > >FWIW, I did try saving/restoring all clobbered regs, and it did fix the
> > >problem - just that it appeared like a (worse) hack to me. Aren't we
> > >manually replicating what RA/reload should be doing?
> >
> > As it appears, we'll have to do it by hand. The attaches patch is just a
> > sketch that indicates how the problem could be approached. Notice the new
> > assertions in the split expanders; they will throw ICE until the fix is
> > actually installed.
> >
> > The critical insn are generated in movMM expander and are changed to have no
> > clobbers (SCRATCHes actually). An a later pass, when appropriate life info
> > can be made available, run yet another avr pass that
> >
> > 1a) Save-restore needed hard regs around the insn.
> >
> > 2a) Kick out hard regs overlapping the clobbers, e.g. in set_src, into new
> > pseudos. Maybe that could happen due to some hard regs progagation, or we
> > can use a new predicate similar combine_pseudo_register_operand.
> >
> > 3) Replace scratch -> hard regs for all scratch_operands.
> >
> > 2b) Restore respective hard regs from their pseudos.
> >
> > 1b) Restore respective hard regs from their pseudos.
> >
> >
> > And maybe we can get better code by allocating things like address register
> > by hand and get better code then.
> >
> > When I implemented some of the libgcc insns I tried to express the operand
> > by means of constraints, e.h. for (reg:HI 22) and let register allocator do
> > the job.
> >
> > The resulting code was functional but *horrific*.
> >
> > The register allocator is not yet ready to generate efficient code in such
> > demanding situations...
> >
> > >
> > >What do you think?
> > >
> >
> > IMO sooner or later we'll need such an infrastructure; maybe also for
> > non-mov insn that are implemented by transparent libcalls like divmod, mul,
> > etc.
>
> I'm curious how other embedded targets handle this though - arm, for
> example? Surely they should have some libcalls/builtins which need
> specific hard regs? I should check that out.
>
> >
> > >>>When cfgexpand.c expands a function call, it first figures out the
> > >>>registers in which the arguments will go, followed by expansion of the
> > >>>arguments themselves (right to left). It later emits mov insns to set
> > >>>the precomputed registers with the expanded RTXes.
> > >>>
> > >>>If one of the arguments is a __memx char pointer dereference, the mov
> > >>>eventually expands to gen_xload<mode>_A (for certain cases), which
> > >>>clobbers R22, R21 and Z. This causes problems if one of those
> > >>>clobbered registers was precomputed to hold another argument.
> > >>>
> > >>>In general, call expansion does not appear to take clobbers into account -
> >
> > We had been warned that using hard regs is evil... But without that
> > technique the code quality would decrease way too much.
>
> Oh ok - do you know some place where this is documented or was
> discussed?
> >
> > >>>when it checks for argument overlap, the RTX (args[i].value) is only a MEM
> > >>>in QImode for the memx deref - the clobber shows up when it eventually
> > >>>calls emit_move_insn, at which point, it is too late.
> >
> > Such situations could only be handled by a target hook which allowed to
> > expand specific trees by hand... Such a hook could cater for insn that must
> > use hard registers.
>
> Yes, I guess so :(
> >
> > >>>This does not happen for a int pointer dereference - turns out that
> > >>>precompute_register_parameters does a copy_to_mode_reg if the
> > >>>cost of args[i].value is more than COSTS_N_INSNS(1) i.e., it creates a
> > >>>pseudo and later assigns the pseudo to the arg register. This is done
> > >>>before any moves to arg registers is done, so other arguments are not
> > >>>overwritten.
> > >>>
> > >>>Doing the same thing - providing a better cost estimate for a MEM rtx in
> > >>>the non-default address space, makes this problem go away, and that is
> > >>>what this patch does. Regression testing does not show any new failures.
> >
> > Can you tell something about overall code quality? If it is not
> > significantly worse then I'd propose to apply your rtx-costs solution soon.
> > The full fix will take more time to work it out.
> >
>
> For this piece of code -
>
> void foo ( char c, unsigned int d);
> void readx (const char __memx *x)
> {
> foo (*x, 0xABCD);
> }
>
> the buggy code (for readx) looks like this
>
> mov r18,r22
> mov r25,r23
> ldi r22,lo8(-51)
> ldi r23,lo8(-85)
> mov r30,r18
> mov r31,r25
> mov r21,r24
> call __xload_1
> mov r24,r22
> jmp foo
>
> With the patch, the code looks like this
>
> movw r30,r22
> mov r21,r24
> call __xload_1
> mov r24,r22
> ldi r22,lo8(-51)
> ldi r23,lo8(-85)
> jmp foo
>
> The code turns out to be better in this case, as the loads to r22,r23
> get done later.
>
> The dump for cfgexpand is where the change originates - the memx dereference is
> saved in a pseudo before the moves to argument registers start.
>
> Before:
>
> (insn 2 4 3 2 (set (reg/v/f:PSI 43 [ x ])
> (reg:PSI 22 r22 [ x ])) foo.c:4 -1
> (nil))
> (note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
> (insn 6 3 7 2 (set (reg:HI 22 r22)
> (const_int -21555 [0xffffffffffffabcd])) foo.c:5 -1
> (nil))
> (insn 7 6 8 2 (parallel [
> (set (reg:QI 24 r24)
> (mem:QI (reg/v/f:PSI 43 [ x ]) [0 *x_2(D)+0 S1 A8 AS7]))
> (clobber (reg:QI 22 r22))
> (clobber (reg:QI 21 r21))
> (clobber (reg:HI 30 r30))
> ]) foo.c:5 -1
> (nil))
> (call_insn/j 8 7 9 2 (parallel [
> (call (mem:HI (symbol_ref:HI ("foo") [flags 0x41] <function_decl 0x7f7129589900 foo>) [0 foo S2 A8])
> (const_int 0 [0]))
> (use (const_int 1 [0x1]))
> ]) foo.c:5 -1
> (nil)
> (expr_list:REG_EQUAL (use (reg:QI 24 r24))
> (expr_list:REG_NONNEG (use (reg:HI 22 r22))
> (nil))))
> (barrier 9 8 0)
>
>
> After:
>
> (insn 2 4 3 2 (set (reg/v/f:PSI 43 [ x ])
> (reg:PSI 22 r22 [ x ])) foo.c:4 -1
> (nil))
> (note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
> (insn 6 3 7 2 (parallel [
> (set (reg:QI 44)
> (mem:QI (reg/v/f:PSI 43 [ x ]) [0 *x_2(D)+0 S1 A8 AS7]))
> (clobber (reg:QI 22 r22))
> (clobber (reg:QI 21 r21))
> (clobber (reg:HI 30 r30))
> ]) foo.c:5 -1
> (nil))
> (insn 7 6 8 2 (set (reg:HI 22 r22)
> (const_int -21555 [0xffffffffffffabcd])) foo.c:5 -1
> (nil))
> (insn 8 7 9 2 (set (reg:QI 24 r24)
> (reg:QI 44)) foo.c:5 -1
> (nil))
> (call_insn/j 9 8 10 2 (parallel [
> (call (mem:HI (symbol_ref:HI ("foo") [flags 0x41] <function_decl 0x7fa986a0f900 foo>) [0 foo S2 A8])
> (const_int 0 [0]))
> (use (const_int 1 [0x1]))
> ]) foo.c:5 -1
> (nil)
> (expr_list:REG_EQUAL (use (reg:QI 24 r24))
> (expr_list:REG_NONNEG (use (reg:HI 22 r22))
> (nil))))
> (barrier 10 9 0)
>
> Without the patch, and with the arguments
> reversed, like this
>
> void foo ( unsigned int d, char c);
> void readx (const char __memx *x)
> {
> foo (0xABCD, *x);
> }
>
> the compiler elides the dereference itself - it just emits
>
> ldi r24,lo8(-51)
> ldi r25,lo8(-85)
> jmp foo
>
> Scary :). That's because expand has
>
> (insn 6 3 7 2 (parallel [
> (set (reg:QI 22 r22)
> (mem:QI (reg/v/f:PSI 43 [ x ]) [0 *x_2(D)+0 S1 A8 AS7]))
> (clobber (reg:QI 22 r22))
> (clobber (reg:QI 21 r21))
> (clobber (reg:HI 30 r30))
> ]) foo.c:5 -1
> (nil))
>
> but this, I guess, will not happen if the fix for PR63633 was applied.
>
> With my cost patch, I get
>
> movw r30,r22
> mov r21,r24
> call __xload_1
> ldi r24,lo8(-51)
> ldi r25,lo8(-85)
> jmp foo
>
> For the testcase posted originally for the bug,
> <snip>
>
> extern void writeFlashByte(uint8_t byte, uint16_t handle);
>
> void writeBuf(const uint8_t __memx *from, const uint8_t __memx *to)
> {
> uint16_t handle = ((uint16_t)(((__uint24)to) & 0xFFFFUL));
> writeFlashByte(*from++, handle++);
> }
>
> </snip>
>
> Before:
>
> mov r27,r22
> mov r26,r23
> mov r21,r24
> mov r24,r20
> movw r22,r18
> mov r30,r27
> mov r31,r26
> call __xload_1
> mov r24,r22
> jmp writeFlashByte
>
> After:
>
> push r16
> push r17
> movw r16,r18
> mov r18,r20
> movw r30,r22
> mov r21,r24
> call __xload_1
> mov r24,r22
> movw r22,r16
> pop r17
> pop r16
> jmp writeFlashByte
>
> Looks like worse code, but reload had to choose call saved regs (r16 and r17) for zero_extendpsisi's
> output operand, as we don't allow REG_X (or greater) for SI mode values
> in avr_hard_regno_mode_ok.
>
> Looks reasonable to me - what do you guys say?
I realize Johann is working on eliminating hard regs usage in favor of register
classes/constraints instead, but I think this patch that sets the correct
cost for MEM rtx in non-default address space does nothing "wrong", even
if it is only a tangential fix to the problem.
What do you guys say?
>
> Regards
> Senthil
> >
> > Johann
> >
^ permalink raw reply [flat|nested] 9+ messages in thread