public inbox for gdb-patches@sourceware.org
 help / color / mirror / Atom feed
* [rx sim] add decode cache
@ 2010-07-29 18:42 DJ Delorie
  2010-07-29 19:23 ` Mike Frysinger
  0 siblings, 1 reply; 8+ messages in thread
From: DJ Delorie @ 2010-07-29 18:42 UTC (permalink / raw)
  To: gdb-patches


This patch is a performance improvement for the RX simulator.  It
stores the opcode decode information from libopcodes so that any given
opcode need not be decoded more than once.  This, together with some
optimizations in the previous patch, results in the simulator taking
about 60% of the time it used to take to run a given program.  On a
3.2GHz desktop, I get an effective 12.5MHz simulated time with full
cycle stats, 14.5MHz with just cycle tracking, and 16MHz without cycle
tracking.

Committed.

[include/opcode]

	* rx.h (RX_Operand_Type): Add TwoReg.
	(RX_Opcode_ID): Remove ediv and edivu.

[opcodes]

	* rx-decode.opc (SRR): New.
	(rx_decode_opcode): Use it for movbi and movbir.  Decode NOP2 (mov
	r0,r0) and NOP3 (max r0,r0) special cases.
	* rx-decode.c: Regenerate.
	
[sim/rx]

	* rx.c (decode_cache_base): New.
	(id_names): Remove ediv and edivu.
	(optype_names): Add TwoReg.
	(maybe_get_mem_page): New.
	(rx_get_byte): Call it.
	(get_op): Add TwoReg support.
	(put_op): Likewise.
	(PD, PS, PS2, GD, GS, GS2, DSZ, SSZ, S2SZ, US1, US2, OM): "opcode"
	is a pointer now.
	(DO_RETURN): New.  We use longjmp to return an exception result.
	(decode_opcode): Make opcode a pointer to the decode cache.  Save
	decoded opcode information and re-use.  Call DO_RETURN instead of
	return throughout.  Remove ediv and edivu.
	* mem.c (ptdc): New.  Adds decode cache.
	(rx_mem_ptr): Support it.
	(rx_mem_decode_cache): New.
	* mem.h (enum mem_ptr_action): Add MPA_DECODE_CACHE.
	(rx_mem_decode_cache): Declare.
	* gdb-if.c (sim_resume): Add decode_opcode's setjmp logic here...
	* main.c (main): ...and here.  Use a fast loop if neither trace
	nor disassemble is given.
	* cpu.h (RX_MAKE_STEPPED, RX_MAKE_HIT_BREAK, RX_MAKE_EXITED,
	RX_MAKE_STOPPED, RX_EXITED, RX_STOPPED): Adjust so that 0 is not a
	valid code for anything.

Index: include/opcode/rx.h
===================================================================
RCS file: /cvs/src/src/include/opcode/rx.h,v
retrieving revision 1.4
diff -p -U3 -r1.4 rx.h
--- include/opcode/rx.h	28 Jul 2010 21:58:22 -0000	1.4
+++ include/opcode/rx.h	29 Jul 2010 18:35:03 -0000
@@ -47,6 +47,7 @@ typedef enum
   RX_Operand_Predec,	/* [-Rn] */
   RX_Operand_Condition,	/* eq, gtu, etc */
   RX_Operand_Flag,	/* [UIOSZC] */
+  RX_Operand_TwoReg,	/* [Rn + scale*R2] */
 } RX_Operand_Type;
 
 typedef enum
@@ -82,8 +83,6 @@ typedef enum
   RXO_min,	/* d = min(d,s) */
   RXO_emul,	/* d:64 = d:32 * s */
   RXO_emulu,	/* d:64 = d:32 * s (unsigned) */
-  RXO_ediv,	/* d:64 / s; d = quot, d+1 = rem */
-  RXO_edivu,	/* d:64 / s; d = quot, d+1 = rem */
 
   RXO_rolc,	/* d <<= 1 through carry */
   RXO_rorc,	/* d >>= 1 through carry*/
Index: opcodes/rx-decode.opc
===================================================================
RCS file: /cvs/src/src/opcodes/rx-decode.opc,v
retrieving revision 1.4
diff -p -U3 -r1.4 rx-decode.opc
--- opcodes/rx-decode.opc	28 Jul 2010 00:36:46 -0000	1.4
+++ opcodes/rx-decode.opc	29 Jul 2010 18:35:03 -0000
@@ -89,6 +89,7 @@ static int dsp3map[] = { 8, 9, 10, 3, 4,
 
 #define SC(i)       OP (1, RX_Operand_Immediate, 0, i)
 #define SR(r)       OP (1, RX_Operand_Register,  r, 0)
+#define SRR(r)      OP (1, RX_Operand_TwoReg,  r, 0)
 #define SI(r,a)     OP (1, RX_Operand_Indirect,  r, a)
 #define SIs(r,a,s)  OP (1, RX_Operand_Indirect,  r, (a) * SCALE[s])
 #define SD(t,r,s)   rx_disp (1, t, r, bwl[s], ld);
@@ -270,14 +271,21 @@ rx_decode_opcode (unsigned long pc AU,
   ID(mov); sBWL (sz); DIs(dst, d*16+sppp, sz); SC(IMM(1)); F_____;
 
 /** 11sz sd ss rsrc rdst	mov%s	%1, %0 */
-  ID(mov); sBWL(sz); F_____;
-  if ((ss == 3) && (sd != 3))
+  if (ss == 3 && sz == 2 && rsrc == 0 && rdst == 0)
     {
-      SD(ss, rdst, sz); DD(sd, rsrc, sz);
+      ID(nop2);
     }
   else
     {
-      SD(ss, rsrc, sz); DD(sd, rdst, sz);
+      ID(mov); sBWL(sz); F_____;
+      if ((ss == 3) && (sd != 3))
+	{
+	  SD(ss, rdst, sz); DD(sd, rsrc, sz);
+	}
+      else
+	{
+	  SD(ss, rsrc, sz); DD(sd, rdst, sz);
+	}
     }
 
 /** 10sz 1dsp a src b dst	mov%s	%1, %0 */
@@ -287,13 +295,13 @@ rx_decode_opcode (unsigned long pc AU,
   ID(mov); sBWL(sz); DIs(dst, dsp*4+a*2+b, sz); SR(src); F_____;
 
 /** 1111 1110 01sz isrc bsrc rdst	mov%s	[%1, %2], %0 */
-  ID(movbi); sBWL(sz); DR(rdst); SR(isrc); S2R(bsrc); F_____;
+  ID(movbi); sBWL(sz); DR(rdst); SRR(isrc); S2R(bsrc); F_____;
 
 /** 1111 1110 00sz isrc bsrc rdst	mov%s	%0, [%1, %2] */
-  ID(movbir); sBWL(sz); DR(rdst); SR(isrc); S2R(bsrc); F_____;
+  ID(movbir); sBWL(sz); DR(rdst); SRR(isrc); S2R(bsrc); F_____;
 
 /** 1111 1110 11sz isrc bsrc rdst	movu%s	[%1, %2], %0 */
-  ID(movbi); uBWL(sz); DR(rdst); SR(isrc); S2R(bsrc); F_____;
+  ID(movbi); uBWL(sz); DR(rdst); SRR(isrc); S2R(bsrc); F_____;
 
 /** 1111 1101 0010 0p sz rdst rsrc	mov%s	%1, %0 */
   ID(mov); sBWL (sz); SR(rsrc); F_____;
@@ -525,7 +533,14 @@ rx_decode_opcode (unsigned long pc AU,
   ID(max); DR(rdst); SC(IMMex(im));
 
 /** 1111 1100 0001 00ss rsrc rdst	max	%1%S1, %0 */
-  ID(max); SP(ss, rsrc); DR(rdst);
+  if (ss == 3 && rsrc == 0 && rdst == 0)
+    {
+      ID(nop3);
+    }
+  else
+    {
+      ID(max); SP(ss, rsrc); DR(rdst);
+    }
 
 /** 0000 0110 mx10 00ss 0000 0100 rsrc rdst	max	%1%S1, %0 */
   ID(max); SPm(ss, rsrc, mx); DR(rdst);
@@ -681,23 +696,23 @@ rx_decode_opcode (unsigned long pc AU,
 
 
 /** 0000 1dsp			bra.s	%a0 */
-  ID(branch); Scc(RXC_always); DC(pc + dsp3map[dsp]);
+  ID(branch); DC(pc + dsp3map[dsp]);
 
 /** 0010 1110			bra.b	%a0 */
-  ID(branch); Scc(RXC_always); DC(pc + IMMex(1));
+  ID(branch); DC(pc + IMMex(1));
 
 /** 0011 1000			bra.w	%a0 */
-  ID(branch); Scc(RXC_always); DC(pc + IMMex(2));
+  ID(branch); DC(pc + IMMex(2));
 
 /** 0000 0100			bra.a	%a0 */
-  ID(branch); Scc(RXC_always); DC(pc + IMMex(3));
+  ID(branch); DC(pc + IMMex(3));
 
 /** 0111 1111 0100 rsrc		bra.l	%0 */
-  ID(branchrel); Scc(RXC_always); DR(rsrc);
+  ID(branchrel); DR(rsrc);
 
 
 /** 0111 1111 0000 rsrc		jmp	%0 */
-  ID(branch); Scc(RXC_always); DR(rsrc);
+  ID(branch); DR(rsrc);
 
 /** 0111 1111 0001 rsrc		jsr	%0 */
   ID(jsr); DR(rsrc);
Index: sim/rx/cpu.h
===================================================================
RCS file: /cvs/src/src/sim/rx/cpu.h,v
retrieving revision 1.3
diff -p -U3 -r1.3 cpu.h
--- sim/rx/cpu.h	28 Jul 2010 21:58:22 -0000	1.3
+++ sim/rx/cpu.h	29 Jul 2010 18:35:03 -0000
@@ -211,16 +211,16 @@ int condition_true (int cond_id);
    - RX_MAKE_HIT_BREAK is the return code for hitting a breakpoint.
    - RX_MAKE_EXITED (C) is the return code for exiting with status C.
    - RX_MAKE_STOPPED (S) is the return code for stopping on signal S.  */
-#define RX_MAKE_STEPPED()   (0)
-#define RX_MAKE_HIT_BREAK() (1)
-#define RX_MAKE_EXITED(c)   (((int) (c) << 8) + 2)
-#define RX_MAKE_STOPPED(s)  (((int) (s) << 8) + 3)
+#define RX_MAKE_STEPPED()   (1)
+#define RX_MAKE_HIT_BREAK() (2)
+#define RX_MAKE_EXITED(c)   (((int) (c) << 8) + 3)
+#define RX_MAKE_STOPPED(s)  (((int) (s) << 8) + 4)
 
 #define RX_STEPPED(r)       ((r) == RX_MAKE_STEPPED ())
 #define RX_HIT_BREAK(r)     ((r) == RX_MAKE_HIT_BREAK ())
-#define RX_EXITED(r)        (((r) & 0xff) == 2)
+#define RX_EXITED(r)        (((r) & 0xff) == 3)
 #define RX_EXIT_STATUS(r)   ((r) >> 8)
-#define RX_STOPPED(r)       (((r) & 0xff) == 3)
+#define RX_STOPPED(r)       (((r) & 0xff) == 4)
 #define RX_STOP_SIG(r)      ((r) >> 8)
 
 /* The step result for the current step.  Global to allow
Index: sim/rx/gdb-if.c
===================================================================
RCS file: /cvs/src/src/sim/rx/gdb-if.c,v
retrieving revision 1.7
diff -p -U3 -r1.7 gdb-if.c
--- sim/rx/gdb-if.c	7 Jul 2010 23:22:43 -0000	1.7
+++ sim/rx/gdb-if.c	29 Jul 2010 18:35:03 -0000
@@ -733,6 +733,8 @@ handle_step (int rc)
 void
 sim_resume (SIM_DESC sd, int step, int sig_to_deliver)
 {
+  int rc;
+
   check_desc (sd);
 
   if (sig_to_deliver != 0)
@@ -745,7 +747,12 @@ sim_resume (SIM_DESC sd, int step, int s
   execution_error_clear_last_error ();
 
   if (step)
-    handle_step (decode_opcode ());
+    {
+      rc = setjmp (decode_jmp_buf);
+      if (rc == 0)
+	rc = decode_opcode ();
+      handle_step (rc);
+    }
   else
     {
       /* We don't clear 'stop' here, because then we would miss
@@ -762,7 +769,9 @@ sim_resume (SIM_DESC sd, int step, int s
 	      break;
 	    }
 
-	  int rc = decode_opcode ();
+	  rc = setjmp (decode_jmp_buf);
+	  if (rc == 0)
+	    rc = decode_opcode ();
 
 	  if (execution_error_get_last_error () != SIM_ERR_NONE)
 	    {
Index: sim/rx/main.c
===================================================================
RCS file: /cvs/src/src/sim/rx/main.c,v
retrieving revision 1.4
diff -p -U3 -r1.4 main.c
--- sim/rx/main.c	28 Jul 2010 21:58:22 -0000	1.4
+++ sim/rx/main.c	29 Jul 2010 18:35:03 -0000
@@ -94,6 +94,7 @@ main (int argc, char **argv)
   int o;
   int save_trace;
   bfd *prog;
+  int rc;
 
   /* By default, we exit when an execution error occurs.  */
   execution_error_init_standalone ();
@@ -178,33 +179,50 @@ main (int argc, char **argv)
 
   sim_disasm_init (prog);
 
-  while (1)
-    {
-      int rc;
-
-      if (trace)
-	printf ("\n");
+  enable_counting = verbose;
 
-      if (disassemble)
-	sim_disasm_one ();
+  rc = setjmp (decode_jmp_buf);
 
-      enable_counting = verbose;
-      rc = decode_opcode ();
-      enable_counting = 0;
-
-      if (RX_HIT_BREAK (rc))
-	done (1);
-      else if (RX_EXITED (rc))
-	done (RX_EXIT_STATUS (rc));
-      else if (RX_STOPPED (rc))
+  if (rc == 0)
+    {
+      if (!trace && !disassemble)
 	{
-	  if (verbose)
-	    printf("Stopped on signal %d\n", RX_STOP_SIG (rc));
-	  exit(1);
+	  /* This will longjmp to the above if an exception
+	     happens.  */
+	  for (;;)
+	    decode_opcode ();
 	}
       else
-	assert (RX_STEPPED (rc));
+	while (1)
+	  {
+
+	    if (trace)
+	      printf ("\n");
+
+	    if (disassemble)
+	      {
+		enable_counting = 0;
+		sim_disasm_one ();
+		enable_counting = verbose;
+	      }
 
-      trace_register_changes ();
+	    rc = decode_opcode ();
+
+	    if (trace)
+	      trace_register_changes ();
+	  }
+    }
+
+  if (RX_HIT_BREAK (rc))
+    done (1);
+  else if (RX_EXITED (rc))
+    done (RX_EXIT_STATUS (rc));
+  else if (RX_STOPPED (rc))
+    {
+      if (verbose)
+	printf("Stopped on signal %d\n", RX_STOP_SIG (rc));
+      exit(1);
     }
+  done (0);
+  exit (0);
 }
Index: sim/rx/mem.c
===================================================================
RCS file: /cvs/src/src/sim/rx/mem.c,v
retrieving revision 1.3
diff -p -U3 -r1.3 mem.c
--- sim/rx/mem.c	28 Jul 2010 21:58:22 -0000	1.3
+++ sim/rx/mem.c	29 Jul 2010 18:35:03 -0000
@@ -30,6 +30,7 @@ along with this program.  If not, see <h
 #include <stdlib.h>
 #include <string.h>
 
+#include "opcode/rx.h"
 #include "mem.h"
 #include "cpu.h"
 #include "syscalls.h"
@@ -46,6 +47,7 @@ along with this program.  If not, see <h
 
 static unsigned char **pt[L1_LEN];
 static unsigned char **ptr[L1_LEN];
+static RX_Opcode_Decoded ***ptdc[L1_LEN];
 
 /* [ get=0/put=1 ][ byte size ] */
 static unsigned int mem_counters[2][5];
@@ -85,16 +87,16 @@ rx_mem_ptr (unsigned long address, enum 
     {
       pt[pt1] = (unsigned char **) calloc (L2_LEN, sizeof (char **));
       ptr[pt1] = (unsigned char **) calloc (L2_LEN, sizeof (char **));
+      ptdc[pt1] = (RX_Opcode_Decoded ***) calloc (L2_LEN, sizeof (RX_Opcode_Decoded ***));
     }
   if (pt[pt1][pt2] == 0)
     {
       if (action == MPA_READING)
 	execution_error (SIM_ERR_READ_UNWRITTEN_PAGES, address);
 
-      pt[pt1][pt2] = (unsigned char *) malloc (OFF_LEN);
-      memset (pt[pt1][pt2], 0, OFF_LEN);
-      ptr[pt1][pt2] = (unsigned char *) malloc (OFF_LEN);
-      memset (ptr[pt1][pt2], MC_UNINIT, OFF_LEN);
+      pt[pt1][pt2] = (unsigned char *) calloc (OFF_LEN, 1);
+      ptr[pt1][pt2] = (unsigned char *) calloc (OFF_LEN, 1);
+      ptdc[pt1][pt2] = (RX_Opcode_Decoded **) calloc (OFF_LEN, sizeof(RX_Opcode_Decoded *));
     }
   else if (action == MPA_READING
 	   && ptr[pt1][pt2][pto] == MC_UNINIT)
@@ -105,14 +107,28 @@ rx_mem_ptr (unsigned long address, enum 
       if (ptr[pt1][pt2][pto] == MC_PUSHED_PC)
 	execution_error (SIM_ERR_CORRUPT_STACK, address);
       ptr[pt1][pt2][pto] = MC_DATA;
+      if (ptdc[pt1][pt2][pto])
+	{
+	  free (ptdc[pt1][pt2][pto]);
+	  ptdc[pt1][pt2][pto] = NULL;
+	}
     }
 
   if (action == MPA_CONTENT_TYPE)
-    return ptr[pt1][pt2] + pto;
+    return (unsigned char *) (ptr[pt1][pt2] + pto);
+
+  if (action == MPA_DECODE_CACHE)
+    return (unsigned char *) (ptdc[pt1][pt2] + pto);
 
   return pt[pt1][pt2] + pto;
 }
 
+RX_Opcode_Decoded **
+rx_mem_decode_cache (unsigned long address)
+{
+  return (RX_Opcode_Decoded **) rx_mem_ptr (address, MPA_DECODE_CACHE);
+}
+
 static inline int
 is_reserved_address (unsigned int address)
 {
@@ -335,7 +351,9 @@ mem_put_qi (int address, unsigned char v
   COUNT (1, 1);
 }
 
+#ifdef CYCLE_ACCURATE
 static int tpu_base;
+#endif
 
 void
 mem_put_hi (int address, unsigned short value)
Index: sim/rx/mem.h
===================================================================
RCS file: /cvs/src/src/sim/rx/mem.h,v
retrieving revision 1.3
diff -p -U3 -r1.3 mem.h
--- sim/rx/mem.h	28 Jul 2010 21:58:22 -0000	1.3
+++ sim/rx/mem.h	29 Jul 2010 18:35:03 -0000
@@ -29,7 +29,8 @@ enum mem_ptr_action
 {
   MPA_WRITING,
   MPA_READING,
-  MPA_CONTENT_TYPE
+  MPA_CONTENT_TYPE,
+  MPA_DECODE_CACHE
 };
 
 void init_mem (void);
@@ -43,6 +44,9 @@ unsigned long mem_usage_cycles (void);
 #define NONPAGE_MASK (~(PAGE_SIZE-1))
 
 unsigned char *rx_mem_ptr (unsigned long address, enum mem_ptr_action action);
+#ifdef RXC_never
+RX_Opcode_Decoded **rx_mem_decode_cache (unsigned long address);
+#endif
 
 void mem_put_qi (int address, unsigned char value);
 void mem_put_hi (int address, unsigned short value);
Index: sim/rx/rx.c
===================================================================
RCS file: /cvs/src/src/sim/rx/rx.c,v
retrieving revision 1.5
diff -p -U3 -r1.5 rx.c
--- sim/rx/rx.c	28 Jul 2010 21:58:22 -0000	1.5
+++ sim/rx/rx.c	29 Jul 2010 18:35:03 -0000
@@ -65,8 +65,6 @@ static const char * id_names[] = {
   "RXO_min",	/* d = min(d,s) */
   "RXO_emul",	/* d:64 = d:32 * s */
   "RXO_emulu",	/* d:64 = d:32 * s (unsigned) */
-  "RXO_ediv",	/* d:64 / s; d = quot, d+1 = rem */
-  "RXO_edivu",	/* d:64 / s; d = quot, d+1 = rem */
 
   "RXO_rolc",	/* d <<= 1 through carry */
   "RXO_rorc",	/* d >>= 1 through carry*/
@@ -145,7 +143,8 @@ static const char * optype_names[] = {
   "Ps++",	/* [Rn+] */
   "--Pr",	/* [-Rn] */
   " cc ",	/* eq, gtu, etc */
-  "Flag"	/* [UIOSZC] */
+  "Flag",	/* [UIOSZC] */
+  "RbRi"	/* [Rb + scale * Ri] */
 };
 
 #define N_RXO (sizeof(id_names)/sizeof(id_names[0]))
@@ -296,8 +295,20 @@ _rx_abort (const char *file, int line)
 }
 
 static unsigned char *get_byte_base;
+static RX_Opcode_Decoded **decode_cache_base;
 static SI get_byte_page;
 
+static inline void
+maybe_get_mem_page (SI tpc)
+{
+  if (((tpc ^ get_byte_page) & NONPAGE_MASK) || enable_counting)
+    {
+      get_byte_page = tpc & NONPAGE_MASK;
+      get_byte_base = rx_mem_ptr (get_byte_page, MPA_READING) - get_byte_page;
+      decode_cache_base = rx_mem_decode_cache (get_byte_page) - get_byte_page;
+    }
+}
+
 /* This gets called a *lot* so optimize it.  */
 static int
 rx_get_byte (void *vdata)
@@ -309,20 +320,16 @@ rx_get_byte (void *vdata)
   if (rx_big_endian)
     tpc ^= 3;
 
-  if (((tpc ^ get_byte_page) & NONPAGE_MASK) || enable_counting)
-    {
-      get_byte_page = tpc & NONPAGE_MASK;
-      get_byte_base = rx_mem_ptr (get_byte_page, MPA_READING) - get_byte_page;
-    }
+  maybe_get_mem_page (tpc);
 
   rx_data->dpc ++;
   return get_byte_base [tpc];
 }
 
 static int
-get_op (RX_Opcode_Decoded *rd, int i)
+get_op (const RX_Opcode_Decoded *rd, int i)
 {
-  RX_Opcode_Operand *o = rd->op + i;
+  const RX_Opcode_Operand *o = rd->op + i;
   int addr, rv = 0;
 
   switch (o->type)
@@ -343,8 +350,11 @@ get_op (RX_Opcode_Decoded *rd, int i)
       /* fall through */
     case RX_Operand_Postinc:	/* [Rn+] */
     case RX_Operand_Indirect:	/* [Rn + addend] */
+    case RX_Operand_TwoReg:	/* [Rn + scale * R2] */
 #ifdef CYCLE_ACCURATE
       RL (o->reg);
+      if (o->type == RX_Operand_TwoReg)
+	RL (rd->op[2].reg);
       regs.rt = -1;
       if (regs.m2m == M2M_BOTH)
 	{
@@ -359,7 +369,11 @@ get_op (RX_Opcode_Decoded *rd, int i)
       memory_source = 1;
 #endif
 
-      addr = get_reg (o->reg) + o->addend;
+      if (o->type == RX_Operand_TwoReg)
+	addr = get_reg (o->reg) * size2bytes[rd->size] + get_reg (rd->op[2].reg);
+      else
+	addr = get_reg (o->reg) + o->addend;
+
       switch (o->size)
 	{
 	case RX_AnySize:
@@ -440,9 +454,9 @@ get_op (RX_Opcode_Decoded *rd, int i)
 }
 
 static void
-put_op (RX_Opcode_Decoded *rd, int i, int v)
+put_op (const RX_Opcode_Decoded *rd, int i, int v)
 {
-  RX_Opcode_Operand *o = rd->op + i;
+  const RX_Opcode_Operand *o = rd->op + i;
   int addr;
 
   switch (o->size)
@@ -504,6 +518,7 @@ put_op (RX_Opcode_Decoded *rd, int i, in
       /* fall through */
     case RX_Operand_Postinc:	/* [Rn+] */
     case RX_Operand_Indirect:	/* [Rn + addend] */
+    case RX_Operand_TwoReg:	/* [Rn + scale * R2] */
 
 #ifdef CYCLE_ACCURATE
       if (regs.m2m == M2M_BOTH)
@@ -518,7 +533,11 @@ put_op (RX_Opcode_Decoded *rd, int i, in
       memory_dest = 1;
 #endif
 
-      addr = get_reg (o->reg) + o->addend;
+      if (o->type == RX_Operand_TwoReg)
+	addr = get_reg (o->reg) * size2bytes[rd->size] + get_reg (rd->op[2].reg);
+      else
+	addr = get_reg (o->reg) + o->addend;
+
       switch (o->size)
 	{
 	case RX_AnySize:
@@ -559,19 +578,19 @@ put_op (RX_Opcode_Decoded *rd, int i, in
     }
 }
 
-#define PD(x) put_op (&opcode, 0, x)
-#define PS(x) put_op (&opcode, 1, x)
-#define PS2(x) put_op (&opcode, 2, x)
-#define GD() get_op (&opcode, 0)
-#define GS() get_op (&opcode, 1)
-#define GS2() get_op (&opcode, 2)
-#define DSZ() size2bytes[opcode.op[0].size]
-#define SSZ() size2bytes[opcode.op[0].size]
-#define S2SZ() size2bytes[opcode.op[0].size]
+#define PD(x) put_op (opcode, 0, x)
+#define PS(x) put_op (opcode, 1, x)
+#define PS2(x) put_op (opcode, 2, x)
+#define GD() get_op (opcode, 0)
+#define GS() get_op (opcode, 1)
+#define GS2() get_op (opcode, 2)
+#define DSZ() size2bytes[opcode->op[0].size]
+#define SSZ() size2bytes[opcode->op[0].size]
+#define S2SZ() size2bytes[opcode->op[0].size]
 
 /* "Universal" sources.  */
-#define US1() ((opcode.op[2].type == RX_Operand_None) ? GD() : GS())
-#define US2() ((opcode.op[2].type == RX_Operand_None) ? GS() : GS2())
+#define US1() ((opcode->op[2].type == RX_Operand_None) ? GD() : GS())
+#define US2() ((opcode->op[2].type == RX_Operand_None) ? GS() : GS2())
 
 static void
 push(int val)
@@ -828,7 +847,7 @@ do_fp_exception (unsigned long opcode_pc
 }
 
 static int
-op_is_memory (RX_Opcode_Decoded *rd, int i)
+op_is_memory (const RX_Opcode_Decoded *rd, int i)
 {
   switch (rd->op[i].type)
     {
@@ -840,7 +859,9 @@ op_is_memory (RX_Opcode_Decoded *rd, int
       return 0;
     }
 }
-#define OM(i) op_is_memory (&opcode, i)
+#define OM(i) op_is_memory (opcode, i)
+
+#define DO_RETURN(x) { longjmp (decode_jmp_buf, x); }
 
 int
 decode_opcode ()
@@ -852,8 +873,7 @@ decode_opcode ()
   long long sll;
   unsigned long opcode_pc;
   RX_Data rx_data;
-  RX_Opcode_Decoded opcode;
-  int rv;
+  const RX_Opcode_Decoded *opcode;
 #ifdef CYCLE_STATS
   unsigned long long prev_cycle_count;
 #endif
@@ -861,9 +881,6 @@ decode_opcode ()
   int tx;
 #endif
 
-  if ((rv = setjmp (decode_jmp_buf)))
-    return rv;
-
 #ifdef CYCLE_STATS
   prev_cycle_count = regs.cycle_count;
 #endif
@@ -875,9 +892,25 @@ decode_opcode ()
 
   rx_cycles ++;
 
-  rx_data.dpc = opcode_pc = regs.r_pc;
-  memset (&opcode, 0, sizeof(opcode));
-  opcode_size = rx_decode_opcode (opcode_pc, &opcode, rx_get_byte, &rx_data);
+  maybe_get_mem_page (regs.r_pc);
+
+  opcode_pc = regs.r_pc;
+
+  /* Note that we don't word-swap this point, there's no point.  */
+  if (decode_cache_base[opcode_pc] == NULL)
+    {
+      RX_Opcode_Decoded *opcode_w;
+      rx_data.dpc = opcode_pc;
+      opcode_w = decode_cache_base[opcode_pc] = calloc (1, sizeof (RX_Opcode_Decoded));
+      opcode_size = rx_decode_opcode (opcode_pc, opcode_w,
+				      rx_get_byte, &rx_data);
+      opcode = opcode_w;
+    }
+  else
+    {
+      opcode = decode_cache_base[opcode_pc];
+      opcode_size = opcode->n_bytes;
+    }
 
 #ifdef CYCLE_ACCURATE
   if (branch_alignment_penalty)
@@ -896,11 +929,11 @@ decode_opcode ()
 
   regs.r_pc += opcode_size;
 
-  rx_flagmask = opcode.flags_s;
-  rx_flagand = ~(int)opcode.flags_0;
-  rx_flagor = opcode.flags_1;
+  rx_flagmask = opcode->flags_s;
+  rx_flagand = ~(int)opcode->flags_0;
+  rx_flagor = opcode->flags_1;
 
-  switch (opcode.id)
+  switch (opcode->id)
     {
     case RXO_abs:
       sll = GS ();
@@ -928,7 +961,7 @@ decode_opcode ()
     case RXO_bclr:
       ma = GD ();
       mb = GS ();
-      if (opcode.op[0].type == RX_Operand_Register)
+      if (opcode->op[0].type == RX_Operand_Register)
 	mb &= 0x1f;
       else
 	mb &= 0x07;
@@ -940,7 +973,7 @@ decode_opcode ()
     case RXO_bmcc:
       ma = GD ();
       mb = GS ();
-      if (opcode.op[0].type == RX_Operand_Register)
+      if (opcode->op[0].type == RX_Operand_Register)
 	mb &= 0x1f;
       else
 	mb &= 0x07;
@@ -955,7 +988,7 @@ decode_opcode ()
     case RXO_bnot:
       ma = GD ();
       mb = GS ();
-      if (opcode.op[0].type == RX_Operand_Register)
+      if (opcode->op[0].type == RX_Operand_Register)
 	mb &= 0x1f;
       else
 	mb &= 0x07;
@@ -965,7 +998,7 @@ decode_opcode ()
       break;
 
     case RXO_branch:
-      if (GS())
+      if (opcode->op[1].type == RX_Operand_None || GS())
 	{
 #ifdef CYCLE_ACCURATE
 	  SI old_pc = regs.r_pc;
@@ -987,9 +1020,6 @@ decode_opcode ()
 	    }
 #ifdef CYCLE_STATS
 	  branch_stalls ++;
-	  /* This is just for statistics */
-	  if (opcode.op[1].reg == 14)
-	    opcode.op[1].type = RX_Operand_None;
 #endif
 #endif
 	}
@@ -1032,11 +1062,11 @@ decode_opcode ()
       {
 	int old_psw = regs.r_psw;
 	if (rx_in_gdb)
-	  return RX_MAKE_HIT_BREAK ();
+	  DO_RETURN (RX_MAKE_HIT_BREAK ());
 	if (regs.r_intb == 0)
 	  {
 	    tprintf("BREAK hit, no vector table.\n");
-	    return RX_MAKE_EXITED(1);
+	    DO_RETURN (RX_MAKE_EXITED(1));
 	  }
 	regs.r_psw &= ~(FLAGBIT_I | FLAGBIT_U | FLAGBIT_PM);
 	pushpc (old_psw);
@@ -1049,7 +1079,7 @@ decode_opcode ()
     case RXO_bset:
       ma = GD ();
       mb = GS ();
-      if (opcode.op[0].type == RX_Operand_Register)
+      if (opcode->op[0].type == RX_Operand_Register)
 	mb &= 0x1f;
       else
 	mb &= 0x07;
@@ -1061,7 +1091,7 @@ decode_opcode ()
     case RXO_btst:
       ma = GS ();
       mb = GS2 ();
-      if (opcode.op[1].type == RX_Operand_Register)
+      if (opcode->op[1].type == RX_Operand_Register)
 	mb &= 0x1f;
       else
 	mb &= 0x07;
@@ -1071,7 +1101,7 @@ decode_opcode ()
       break;
 
     case RXO_clrpsw:
-      v = 1 << opcode.op[0].reg;
+      v = 1 << opcode->op[0].reg;
       if (FLAG_PM
 	  && (v == FLAGBIT_I
 	      || v == FLAGBIT_U))
@@ -1120,60 +1150,13 @@ decode_opcode ()
       cycles (20);
       break;
 
-    case RXO_ediv:
-      ma = GS();
-      mb = GD();
-      tprintf("%d / %d = ", mb, ma);
-      if (ma == 0 || (ma == -1 && (unsigned int) mb == 0x80000000))
-	{
-	  tprintf("#NAN\n");
-	  set_flags (FLAGBIT_O, FLAGBIT_O);
-	}
-      else
-	{
-	  v = mb/ma;
-	  mb = mb%ma;
-	  tprintf("%d, rem %d\n", v, mb);
-	  set_flags (FLAGBIT_O, 0);
-	  PD (v);
-	  opcode.op[0].reg ++;
-	  PD (mb);
-	}
-      /* Note: spec says 3 to 22 cycles, we are pessimistic.  */
-      cycles (22);
-      break;
-
-    case RXO_edivu:
-      uma = GS();
-      umb = GD();
-      tprintf("%u / %u = ", umb, uma);
-      if (uma == 0)
-	{
-	  tprintf("#NAN\n");
-	  set_flags (FLAGBIT_O, FLAGBIT_O);
-	}
-      else
-	{
-	  v = umb/uma;
-	  umb = umb%uma;
-	  tprintf("%u, rem %u\n", v, umb);
-	  set_flags (FLAGBIT_O, 0);
-	  PD (v);
-	  opcode.op[0].reg ++;
-	  PD (umb);
-	}
-      /* Note: spec says 2 to 20 cycles, we are pessimistic.  */
-      cycles (20);
-      break;
-
     case RXO_emul:
       ma = GD ();
       mb = GS ();
       sll = (long long)ma * (long long)mb;
       tprintf("%d * %d = %lld\n", ma, mb, sll);
-      PD (sll);
-      opcode.op[0].reg ++;
-      PD (sll >> 32);
+      put_reg (opcode->op[0].reg, sll);
+      put_reg (opcode->op[0].reg + 1, sll >> 32);
       E2;
       break;
 
@@ -1182,9 +1165,8 @@ decode_opcode ()
       umb = GS ();
       ll = (long long)uma * (long long)umb;
       tprintf("%#x * %#x = %#llx\n", uma, umb, ll);
-      PD (ll);
-      opcode.op[0].reg ++;
-      PD (ll >> 32);
+      put_reg (opcode->op[0].reg, ll);
+      put_reg (opcode->op[0].reg + 1, ll >> 32);
       E2;
       break;
 
@@ -1242,7 +1224,7 @@ decode_opcode ()
       v = GS ();
       if (v == 255)
 	{
-	  return rx_syscall (regs.r[5]);
+	  DO_RETURN (rx_syscall (regs.r[5]));
 	}
       else
 	{
@@ -1278,7 +1260,7 @@ decode_opcode ()
 	regs.link_register = regs.r_pc;
 #endif
 	pushpc (get_reg (pc));
-	if (opcode.id == RXO_jsrrel)
+	if (opcode->id == RXO_jsrrel)
 	  v += regs.r_pc;
 #ifdef CYCLE_ACCURATE
 	delta = v - regs.r_pc;
@@ -1323,12 +1305,6 @@ decode_opcode ()
       else
 	PD (mb);
       E (1);
-#ifdef CYCLE_STATS
-      if (opcode.op[0].type == RX_Operand_Register
-	  && opcode.op[1].type == RX_Operand_Register
-	  && opcode.op[0].reg == opcode.op[1].reg)
-	opcode.id = RXO_nop3;
-#endif
       break;
 
     case RXO_min:
@@ -1344,8 +1320,8 @@ decode_opcode ()
     case RXO_mov:
       v = GS ();
 
-      if (opcode.op[0].type == RX_Operand_Register
-	  && opcode.op[0].reg == 16 /* PSW */)
+      if (opcode->op[0].type == RX_Operand_Register
+	  && opcode->op[0].reg == 16 /* PSW */)
 	{
 	  /* Special case, LDC and POPC can't ever modify PM.  */
 	  int pm = regs.r_psw & FLAGBIT_PM;
@@ -1360,16 +1336,16 @@ decode_opcode ()
       if (FLAG_PM)
 	{
 	  /* various things can't be changed in user mode.  */
-	  if (opcode.op[0].type == RX_Operand_Register)
-	    if (opcode.op[0].reg == 32)
+	  if (opcode->op[0].type == RX_Operand_Register)
+	    if (opcode->op[0].reg == 32)
 	      {
 		v &= ~ (FLAGBIT_I | FLAGBIT_U | FLAGBITS_IPL);
 		v |= regs.r_psw & (FLAGBIT_I | FLAGBIT_U | FLAGBITS_IPL);
 	      }
-	  if (opcode.op[0].reg == 34 /* ISP */
-	      || opcode.op[0].reg == 37 /* BPSW */
-	      || opcode.op[0].reg == 39 /* INTB */
-	      || opcode.op[0].reg == 38 /* VCT */)
+	  if (opcode->op[0].reg == 34 /* ISP */
+	      || opcode->op[0].reg == 37 /* BPSW */
+	      || opcode->op[0].reg == 39 /* INTB */
+	      || opcode->op[0].reg == 38 /* VCT */)
 	    /* These are ignored.  */
 	    break;
 	}
@@ -1381,10 +1357,10 @@ decode_opcode ()
       PD (v);
 
 #ifdef CYCLE_ACCURATE
-      if ((opcode.op[0].type == RX_Operand_Predec
-	   && opcode.op[1].type == RX_Operand_Register)
-	  || (opcode.op[0].type == RX_Operand_Postinc
-	      && opcode.op[1].type == RX_Operand_Register))
+      if ((opcode->op[0].type == RX_Operand_Predec
+	   && opcode->op[1].type == RX_Operand_Register)
+	  || (opcode->op[0].type == RX_Operand_Postinc
+	      && opcode->op[1].type == RX_Operand_Register))
 	{
 	  /* Special case: push reg doesn't cause a memory stall.  */
 	  memory_dest = 0;
@@ -1393,32 +1369,14 @@ decode_opcode ()
 #endif
 
       set_sz (v, DSZ());
-#ifdef CYCLE_STATS
-      if (opcode.op[0].type == RX_Operand_Register
-	  && opcode.op[1].type == RX_Operand_Register
-	  && opcode.op[0].reg == opcode.op[1].reg)
-	opcode.id = RXO_nop2;
-#endif
       break;
 
     case RXO_movbi:
-      /* We cheat to save on code duplication. */
-      regs.r_temp = (get_reg (opcode.op[1].reg) * size2bytes[opcode.size]
-		     + get_reg (opcode.op[2].reg));
-      opcode.op[1].reg = r_temp_idx;
-      opcode.op[1].type = RX_Operand_Indirect;
-      opcode.op[1].addend = 0;
       PD (GS ());
       cycles (1);
       break;
 
     case RXO_movbir:
-      /* We cheat to save on code duplication. */
-      regs.r_temp = (get_reg (opcode.op[1].reg) * size2bytes[opcode.size]
-		     + get_reg (opcode.op[2].reg));
-      opcode.op[1].reg = r_temp_idx;
-      opcode.op[1].type = RX_Operand_Indirect;
-      opcode.op[1].addend = 0;
       PS (GD ());
       cycles (1);
       break;
@@ -1478,6 +1436,8 @@ decode_opcode ()
       break;
 
     case RXO_nop:
+    case RXO_nop2:
+    case RXO_nop3:
       E1;
       break;
 
@@ -1487,14 +1447,14 @@ decode_opcode ()
 
     case RXO_popm:
       /* POPM cannot pop R0 (sp).  */
-      if (opcode.op[1].reg == 0 || opcode.op[2].reg == 0)
+      if (opcode->op[1].reg == 0 || opcode->op[2].reg == 0)
 	EXCEPTION (EX_UNDEFINED);
-      if (opcode.op[1].reg >= opcode.op[2].reg)
+      if (opcode->op[1].reg >= opcode->op[2].reg)
 	{
 	  regs.r_pc = opcode_pc;
-	  return RX_MAKE_STOPPED (SIGILL);
+	  DO_RETURN (RX_MAKE_STOPPED (SIGILL));
 	}
-      for (v = opcode.op[1].reg; v <= opcode.op[2].reg; v++)
+      for (v = opcode->op[1].reg; v <= opcode->op[2].reg; v++)
 	{
 	  cycles (1);
 	  RLD (v);
@@ -1504,19 +1464,19 @@ decode_opcode ()
 
     case RXO_pushm:
       /* PUSHM cannot push R0 (sp).  */
-      if (opcode.op[1].reg == 0 || opcode.op[2].reg == 0)
+      if (opcode->op[1].reg == 0 || opcode->op[2].reg == 0)
 	EXCEPTION (EX_UNDEFINED);
-      if (opcode.op[1].reg >= opcode.op[2].reg)
+      if (opcode->op[1].reg >= opcode->op[2].reg)
 	{
 	  regs.r_pc = opcode_pc;
 	  return RX_MAKE_STOPPED (SIGILL);
 	}
-      for (v = opcode.op[2].reg; v >= opcode.op[1].reg; v--)
+      for (v = opcode->op[2].reg; v >= opcode->op[1].reg; v--)
 	{
 	  RL (v);
 	  push (get_reg (v));
 	}
-      cycles (opcode.op[2].reg - opcode.op[1].reg + 1);
+      cycles (opcode->op[2].reg - opcode->op[1].reg + 1);
       break;
 
     case RXO_racw:
@@ -1573,7 +1533,7 @@ decode_opcode ()
 	{
 	  long long tmp;
 
-	  switch (opcode.size)
+	  switch (opcode->size)
 	    {
 	    case RX_Long:
 	      ma = mem_get_si (regs.r[1]);
@@ -1627,7 +1587,7 @@ decode_opcode ()
       else
 	set_flags (FLAGBIT_O|FLAGBIT_S, ma);
 #ifdef CYCLE_ACCURATE
-      switch (opcode.size)
+      switch (opcode->size)
 	{
 	case RX_Long:
 	  cycles (6 + 4 * tx);
@@ -1725,17 +1685,17 @@ decode_opcode ()
       break;
 
     case RXO_rtsd:
-      if (opcode.op[2].type == RX_Operand_Register)
+      if (opcode->op[2].type == RX_Operand_Register)
 	{
 	  int i;
 	  /* RTSD cannot pop R0 (sp).  */
-	  put_reg (0, get_reg (0) + GS() - (opcode.op[0].reg-opcode.op[2].reg+1)*4);
-	  if (opcode.op[2].reg == 0)
+	  put_reg (0, get_reg (0) + GS() - (opcode->op[0].reg-opcode->op[2].reg+1)*4);
+	  if (opcode->op[2].reg == 0)
 	    EXCEPTION (EX_UNDEFINED);
 #ifdef CYCLE_ACCURATE
-	  tx = opcode.op[0].reg - opcode.op[2].reg + 1;
+	  tx = opcode->op[0].reg - opcode->op[2].reg + 1;
 #endif
-	  for (i = opcode.op[2].reg; i <= opcode.op[0].reg; i ++)
+	  for (i = opcode->op[2].reg; i <= opcode->op[0].reg; i ++)
 	    {
 	      RLD (i);
 	      put_reg (i, pop ());
@@ -1807,7 +1767,7 @@ decode_opcode ()
       break;
 
     case RXO_setpsw:
-      v = 1 << opcode.op[0].reg;
+      v = 1 << opcode->op[0].reg;
       if (FLAG_PM
 	  && (v == FLAGBIT_I
 	      || v == FLAGBIT_U))
@@ -1880,7 +1840,7 @@ decode_opcode ()
 #ifdef CYCLE_ACCURATE
       tx = regs.r[3];
 #endif
-      switch (opcode.size)
+      switch (opcode->size)
 	{
 	case RX_Long:
 	  while (regs.r[3] != 0)
@@ -1923,7 +1883,7 @@ decode_opcode ()
     case RXO_stop:
       PRIVILEDGED ();
       regs.r_psw |= FLAGBIT_I;
-      return RX_MAKE_STOPPED(0);
+      DO_RETURN (RX_MAKE_STOPPED(0));
 
     case RXO_sub:
       MATH_OP (-, 0);
@@ -1939,7 +1899,7 @@ decode_opcode ()
 	  cycles (3);
 	  break;
 	}
-      switch (opcode.size)
+      switch (opcode->size)
 	{
 	case RX_Long:
 	  uma = get_reg (2);
@@ -1993,7 +1953,7 @@ decode_opcode ()
 #endif
       if (regs.r[3] == 0)
 	break;
-      switch (opcode.size)
+      switch (opcode->size)
 	{
 	case RX_Long:
 	  uma = get_reg (2);
@@ -2043,7 +2003,7 @@ decode_opcode ()
     case RXO_wait:
       PRIVILEDGED ();
       regs.r_psw |= FLAGBIT_I;
-      return RX_MAKE_STOPPED(0);
+      DO_RETURN (RX_MAKE_STOPPED(0));
 
     case RXO_xchg:
 #ifdef CYCLE_ACCURATE
@@ -2082,7 +2042,7 @@ decode_opcode ()
 #ifdef CYCLE_STATS
   if (prev_cycle_count == regs.cycle_count)
     {
-      printf("Cycle count not updated! id %s\n", id_names[opcode.id]);
+      printf("Cycle count not updated! id %s\n", id_names[opcode->id]);
       abort ();
     }
 #endif
@@ -2090,15 +2050,15 @@ decode_opcode ()
 #ifdef CYCLE_STATS
   if (running_benchmark)
     {
-      int omap = op_lookup (opcode.op[0].type, opcode.op[1].type, opcode.op[2].type);
+      int omap = op_lookup (opcode->op[0].type, opcode->op[1].type, opcode->op[2].type);
 
 
-      cycles_per_id[opcode.id][omap] += regs.cycle_count - prev_cycle_count;
-      times_per_id[opcode.id][omap] ++;
+      cycles_per_id[opcode->id][omap] += regs.cycle_count - prev_cycle_count;
+      times_per_id[opcode->id][omap] ++;
 
-      times_per_pair[prev_opcode_id][po0][opcode.id][omap] ++;
+      times_per_pair[prev_opcode_id][po0][opcode->id][omap] ++;
 
-      prev_opcode_id = opcode.id;
+      prev_opcode_id = opcode->id;
       po0 = omap;
     }
 #endif

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-29 18:42 [rx sim] add decode cache DJ Delorie
@ 2010-07-29 19:23 ` Mike Frysinger
  2010-07-29 19:34   ` DJ Delorie
  0 siblings, 1 reply; 8+ messages in thread
From: Mike Frysinger @ 2010-07-29 19:23 UTC (permalink / raw)
  To: gdb-patches; +Cc: DJ Delorie

[-- Attachment #1: Type: Text/Plain, Size: 1075 bytes --]

On Thursday, July 29, 2010 14:41:38 DJ Delorie wrote:
> This patch is a performance improvement for the RX simulator.  It
> stores the opcode decode information from libopcodes so that any given
> opcode need not be decoded more than once.  This, and some
> optimizations in the previous patch, results in the simulator taking
> about 60% of the time it used to take to run a given program.  On a
> 3.2GHz desktop, I get an effective 12.5MHz simulated time with full
> cycle stats, 14.5MHz with just cycle tracking, and 16MHz without cycle
> tracking.

if the rx sim is ultimately built on top of opc2c, and you're caching the 
results of that, then shouldnt it be possible to keep the cache in a generic 
place where everyone using opc2c would be able to leverage it ?

also, on a semi-related note, i cant find any documentation or info in the 
archives on opc2c.  it seems to have been quietly merged with the rx port and 
not really given any public info.  looking at the rx opc file, it seems like 
it'd be useful to port some peeps over to it.
-mike

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-29 19:23 ` Mike Frysinger
@ 2010-07-29 19:34   ` DJ Delorie
  2010-07-29 21:42     ` Mike Frysinger
  0 siblings, 1 reply; 8+ messages in thread
From: DJ Delorie @ 2010-07-29 19:34 UTC (permalink / raw)
  To: Mike Frysinger; +Cc: gdb-patches


> if the rx sim is ultimately built on top of opc2c, and you're
> caching the results of that, then shouldnt it be possible to keep
> the cache in a generic place where everyone using opc2c would be
> able to leverage it ?

Well, that depends on the memory management scheme that the simulator
is using.  In the RX case, it's using page tables, so when a page of
memory is allocated, the corresponding page of decode pointers is
allocated as well.  Technically, it's not a "cache" in that sense, as
we store *all* decodes, not just the LRU ones.

Plus, the information that's cached isn't the decode logic that opc2c
produces, it's the semantic logic that rx-decode.opc adds on top of
that.  Look at sim/m32c/m32c.opc for an alternate example.  The decode
is the same, but the semantics are completely different.

> also, on a semi-related note, i cant find any documentation or info
> in the archives on opc2c.  it seems to have been quietly merged with
> the rx port and not really given any public info.  looking at the rx
> opc file, it seems like it'd be useful to port some peeps over to
> it.

I agree.  I'm working on porting the m32c opcodes over to it, but so
many other things are higher on my priority list...

opc2c was originally used in my m32c simulator, but was undocumented
there too.  It's not RX-specific (not with two chips behind it) but it
is designed to decode CISC architectures, like the m32c, RX, or i386.
Specifically, cases where you don't know the length of the opcode
until you've started decoding it.

All it does, however, is build the decode logic and pull out the
operands.  It doesn't decode the semantics at all; it's just a way of
building a complex if/switch tree from some comments.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-29 19:34   ` DJ Delorie
@ 2010-07-29 21:42     ` Mike Frysinger
  2010-07-29 22:00       ` DJ Delorie
  0 siblings, 1 reply; 8+ messages in thread
From: Mike Frysinger @ 2010-07-29 21:42 UTC (permalink / raw)
  To: DJ Delorie; +Cc: gdb-patches

[-- Attachment #1: Type: Text/Plain, Size: 2342 bytes --]

On Thursday, July 29, 2010 15:33:53 DJ Delorie wrote:
> > if the rx sim is ultimately built on top of opc2c, and you're
> > caching the results of that, then shouldnt it be possible to keep
> > the cache in a generic place where everyone using opc2c would be
> > able to leverage it ?
> 
> Well, that depends on the memory management scheme that the simulator
> is using.  In the RX case, it's using page tables, so when a page of
> memory is allocated, the corresponding page of decode pointers is
> allocated as well.  Technically, it's not a "cache" in that sense, as
> we store *all* decodes, not just the LRU ones.
> 
> Plus, the information that's cached isn't the decode logic that opc2c
> produces, it's the semantic logic that rx-decode.opc adds on top of
> that.  Look at sim/m32c/m32c.opc for an alternate example.  The decode
> is the same, but the semantics are completely different.

ok, so the cached info isnt as generic as i'd like ;).  i wonder if we could 
fit a cache in there somewhere though ...

> > also, on a semi-related note, i cant find any documentation or info
> > in the archives on opc2c.  it seems to have been quietly merged with
> > the rx port and not really given any public info.  looking at the rx
> > opc file, it seems like it'd be useful to port some peeps over to
> > it.
> 
> I agree.  I'm working on porting the m32c opcodes over to it, but so
> many other things are higher on my priority list...
> 
> opc2c was originally used in my m32c simulator, but was undocumented
> there too.  It's not RX-specific (not with two chips behind it) but it
> is designed to decode CISC architectures, like the m32c, RX, or i386.
> Specifically, cases where you don't know the length of the opcode
> until you've started decoding it.
> 
> All it does, however, is build the decode logic and pull out the
> operands.  It doesn't decode the semantics at all; it's just a way of
> building a complex if/switch tree from some comments.

doesnt seem like it's limited to CISC arches though ... in the Blackfin 
decode/sim, we too have a big tree of if/switch/masks to pull out arguments.  
ive always been annoyed that we had to copy the decode file, gut it, and then 
fill in the sim pieces to make it work.  seems like this opc2c might be a way 
back from that.
-mike

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-29 21:42     ` Mike Frysinger
@ 2010-07-29 22:00       ` DJ Delorie
  2010-07-30  0:09         ` Mike Frysinger
  2010-07-30  1:04         ` Kevin Buettner
  0 siblings, 2 replies; 8+ messages in thread
From: DJ Delorie @ 2010-07-29 22:00 UTC (permalink / raw)
  To: Mike Frysinger; +Cc: gdb-patches, kevinb


> ok, so the cached info isnt as generic as i'd like ;).  i wonder if
> we could fit a cache in there somewhere though ...

I don't think the decode is as cpu-intensive as the semantics, though.
It seems to me there are a *lot* of loops in most software, so the
more info you can re-use, the better.  Actually, decoding a single
opcode's syntax and semantics doesn't take that long, it's just that
benchmarks tend to run *zillions* of opcodes, so even tiny savings add
up.

> doesnt seem like it's limited to CISC arches though ... in the
> Blackfin decode/sim, we too have a big tree of if/switch/masks to
> pull out arguments.  ive always been annoyed that we had to copy the
> decode file, gut it, and then fill in the sim pieces to make it
> work.  seems like this opc2c might be a way back from that.

There's no reason why it *wouldn't* work for RISC architectures, of
course, I just never tried it, and don't know how optimal it would be
with it.  However, if you have a RISC case where an operand field
isn't fully used, and certain operand patterns mean a whole different
opcode, opc2c can help you there - it will only decode to a specific
opcode if its operands are valid too, which is *really* hard to get
right with simple mask tables.

For the m32c, I used opc2c for the simulator, and cgen was used for
everything else.  For RX, opc2c was pushed to libopcodes, and it's
used for the simulator, disassembler, *and* gdb.  Nowhere else are RX
opcodes decoded.  Maybe Kevin can comment on how different it was to
use opc2c's decoder for gdb's prolog analyzer?

For the RX assembler, I used bison.  The resulting file *looks* like
an opc2c input file - syntax followed by semantics - but I didn't try
to use the same input file for both purposes.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-29 22:00       ` DJ Delorie
@ 2010-07-30  0:09         ` Mike Frysinger
  2010-07-30  0:17           ` DJ Delorie
  2010-07-30  1:04         ` Kevin Buettner
  1 sibling, 1 reply; 8+ messages in thread
From: Mike Frysinger @ 2010-07-30  0:09 UTC (permalink / raw)
  To: DJ Delorie; +Cc: gdb-patches, kevinb

[-- Attachment #1: Type: Text/Plain, Size: 3003 bytes --]

On Thursday, July 29, 2010 18:00:29 DJ Delorie wrote:
> > ok, so the cached info isnt as generic as i'd like ;).  i wonder if
> > we could fit a cache in there somewhere though ...
> 
> I don't think the decode is as cpu-intensive as the semantics, though.
> It seems to me there are a *lot* of loops in most software, so the
> more info you can re-use, the better.  Actually, decoding a single
> opcode's syntax and semantics doesn't take that long, it's just that
> benchmarks tend to run *zillions* of opcodes, so even tiny savings add
> up.

right ... i'm booting the Linux kernel all the way to userspace with the 
Blackfin sim.  and then running some benchmarks in that.  the faster i can 
make this the happier i'll be :).

sometimes i go even crazier and boot U-Boot in the sim, load a uImage over the 
simulated network from a real host (via tun/tap), and then boot the kernel 
that way.  decompression is the worst part.

> > doesnt seem like it's limited to CISC arches though ... in the
> > Blackfin decode/sim, we too have a big tree of if/switch/masks to
> > pull out arguments.  ive always been annoyed that we had to copy the
> > decode file, gut it, and then fill in the sim pieces to make it
> > work.  seems like this opc2c might be a way back from that.
> 
> There's no reason why it *wouldn't* work for RISC architectures, of
> course, I just never tried it, and don't know how optimal it would be
> with it.  However, if you have a RISC case where an operand field
> isn't fully used, and certain operand patterns mean a whole different
> opcode, opc2c can help you there - it will only decode to a specific
> opcode if its operands are valid too, which is *really* hard to get
> right with simple mask tables.

yeah, we've had to do a lot of checks to make sure we dont go decoding invalid 
opcodes as valid insns.  we had a lot of this originally, but after putting 
together a lot of test cases that basically test the entire opcode space to 
make sure we dont revert behavior (only a few ten thousand insns in that one 
test ;x).

> For the m32c, I used opc2c for the simulator, and cgen was used for
> everything else.  For RX, opc2c was pushed to libopcodes, and it's
> used for the simulator, disassembler, *and* gdb.  Nowhere else are RX
> opcodes decoded.  Maybe Kevin can comment on how different it was to
> use opc2c's decoder for gdb's prolog analyzer?

i havent been able to figure out cgen.  but from what i can see, with a 
complete implementation already, i cant say it's worth the effort to move to 
the cgen infrastructure.  but it does look like opc2c wouldnt be too much 
effort to integrate and it'd be worth the work to have our opcodes/sim 
integrated better.

> For the RX assembler, I used bison.  The resulting file *looks* like
> an opc2c input file - syntax followed by semantics - but I didn't try
> to use the same input file for both purposes.

yeah, we use lacc/yex too with our parser.
-mike

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-30  0:09         ` Mike Frysinger
@ 2010-07-30  0:17           ` DJ Delorie
  0 siblings, 0 replies; 8+ messages in thread
From: DJ Delorie @ 2010-07-30  0:17 UTC (permalink / raw)
  To: Mike Frysinger; +Cc: gdb-patches


> i havent been able to figure out cgen.  but from what i can see,
> with a complete implementation already, i cant say it's worth the
> effort to move to the cgen infrastructure.

In the case of the m32c, the cgen implementation is *so* complex, that
there's still a few opcodes it doesn't support, and I haven't been
able to figure out how to fix it yet.

CGEN at the time really didn't like variable length opcodes.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [rx sim] add decode cache
  2010-07-29 22:00       ` DJ Delorie
  2010-07-30  0:09         ` Mike Frysinger
@ 2010-07-30  1:04         ` Kevin Buettner
  1 sibling, 0 replies; 8+ messages in thread
From: Kevin Buettner @ 2010-07-30  1:04 UTC (permalink / raw)
  To: gdb-patches

On Thu, 29 Jul 2010 18:00:29 -0400
DJ Delorie <dj@redhat.com> wrote:

> For the m32c, I used opc2c for the simulator, and cgen was used for
> everything else.  For RX, opc2c was pushed to libopcodes, and it's
> used for the simulator, disassembler, *and* gdb.  Nowhere else are RX
> opcodes decoded.  Maybe Kevin can comment on how different it was to
> use opc2c's decoder for gdb's prolog analyzer?

Use of opc2c's decoder in writing GDB's prologue analyzer has several
advantages over the ad hoc decoders that we often write for a GDB
architecture port:

1) There's usually some amount of effort required to understand the
   instruction set encodings, and additional effort required to decide
   how to best decode the ones that GDB might care about.  Use of
   the opc2c decoder meant that much of that work was already done.
   The author of the prologue analyzer must still have a rudimentary
   understanding of the instruction encodings though.  (E.g. are the
   instructions fixed width or variable width?  What's the length
   of the shortest instruction?  Etc.)

2) The ad hoc instruction decoders that I've either written, or have
   worked upon for GDB usually involve masking some magic number with
   the instruction under consideration and then comparing that result
   with another magic number.  Sometimes these magic numbers are made
   a bit less magic by mapping them to a symbolic constant via the use
   of a suitable define or enum.  But coming up with them is error
   prone and it's by no means certain that code using them is
   adequately tested.

   The opc2c decoder, on the other hand, already has symbolic constants
   assigned to each instruction.  There is greater certainty that the
   decoded instruction is correct since it's used in the simulator and
   disassembler too.

3) The opc2c decoder decodes the instruction's operands too.  When
   writing an ad hoc decoder, the operand decoding is usually
   accomplished via shifts, masks, etc.   Again, there are more
   magic numbers involved, and yet more sources of error that can
   easily creep into the prologue analyzer.

   I do recall, however, being a bit surprised by the operand order
   for one of the RX instructions at one point.  Not a big deal, but
   it's worth noting that the prologue analyzer's author still has to
   take some care to get things right.

So, to summarize...  The opc2c decoder is better tested since it is
used in other tools.  Due to the fact that it provides symbolic
decoding of the opcodes and operands, prologue analyzer code is easier
to write, and has a better chance of being correct.  There is also a
savings of development time since the author of the prologue analyzer
need not do a detailed analysis of instruction and operand formats.

Kevin

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2010-07-30  1:04 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-07-29 18:42 [rx sim] add decode cache DJ Delorie
2010-07-29 19:23 ` Mike Frysinger
2010-07-29 19:34   ` DJ Delorie
2010-07-29 21:42     ` Mike Frysinger
2010-07-29 22:00       ` DJ Delorie
2010-07-30  0:09         ` Mike Frysinger
2010-07-30  0:17           ` DJ Delorie
2010-07-30  1:04         ` Kevin Buettner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).