diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8a869eb3b30..d5116801498 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23270,6 +23270,116 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
   return true;
 }
 
+void
+ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode qimode = GET_MODE (dest);
+  rtx qop1, qop2, hop1, hop2, qdest, hres;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
+  bool uns_p = true;
+
+  switch (qimode)
+    {
+    case E_V4QImode:
+    case E_V8QImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
+
+  if (op2vec)
+    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
+  else
+    qop2 = op2;
+
+  switch (code)
+    {
+    case MULT:
+      gcc_assert (op2vec);
+      /* Unpack data such that we've got a source byte in each low byte of
+	 each word.  We don't care what goes into the high byte of each word.
+	 Rather than trying to get zero in there, most convenient is to let
+	 it be a copy of the low byte.  */
+      hop1 = copy_to_reg (qop1);
+      hop2 = copy_to_reg (qop2);
+      emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
+      emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
+      break;
+
+    case ASHIFTRT:
+      uns_p = false;
+      /* FALLTHRU */
+    case ASHIFT:
+    case LSHIFTRT:
+      hop1 = gen_reg_rtx (V8HImode);
+      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
+      /* vashr/vlshr/vashl  */
+      if (op2vec)
+	{
+	  hop2 = gen_reg_rtx (V8HImode);
+	  ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
+	}
+      else
+	hop2 = qop2;
+
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (code != MULT && op2vec)
+    {
+      /* Expand vashr/vlshr/vashl.  */
+      hres = gen_reg_rtx (V8HImode);
+      emit_insn (gen_rtx_SET (hres,
+			      simplify_gen_binary (code, V8HImode,
+						   hop1, hop2)));
+    }
+  else
+    /* Expand mult/ashr/lshr/ashl.  */
+    hres = expand_simple_binop (V8HImode, code, hop1, hop2,
+				NULL_RTX, 1, OPTAB_DIRECT);
+
+  if (TARGET_AVX512BW && TARGET_AVX512VL)
+    {
+      if (qimode == V8QImode)
+	qdest = dest;
+      else
+	qdest = gen_reg_rtx (V8QImode);
+
+      emit_insn (gen_truncv8hiv8qi2 (qdest, hres));
+    }
+  else
+    {
+      struct expand_vec_perm_d d;
+      rtx qres = gen_lowpart (V16QImode, hres);
+      bool ok;
+      int i;
+
+      qdest = gen_reg_rtx (V16QImode);
+
+      /* Merge the data back into the right place.  */
+      d.target = qdest;
+      d.op0 = qres;
+      d.op1 = qres;
+      d.vmode = V16QImode;
+      d.nelt = 16;
+      d.one_operand_p = false;
+      d.testing_p = false;
+
+      for (i = 0; i < d.nelt; ++i)
+	d.perm[i] = i * 2;
+
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      gcc_assert (ok);
+    }
+
+  if (qdest != dest)
+    emit_move_insn (dest, gen_lowpart (qimode, qdest));
+}
+
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
 
@@ -23281,6 +23391,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   rtx (*gen_il) (rtx, rtx, rtx);
   rtx (*gen_ih) (rtx, rtx, rtx);
   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   struct expand_vec_perm_d d;
   bool full_interleave = true;
   bool uns_p = true;
@@ -23315,6 +23426,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   switch (code)
     {
     case MULT:
+      gcc_assert (op2vec);
       /* Unpack data such that we've got a source byte in each low byte of
 	 each word.  We don't care what goes into the high byte of each word.
 	 Rather than trying to get zero in there, most convenient is to let
@@ -23360,7 +23472,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
       /* vashr/vlshr/vashl  */
-      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+      if (op2vec)
 	{
 	  rtx tmp = force_reg (qimode, op2);
 	  op2_l = gen_reg_rtx (himode);
@@ -23376,8 +23488,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       gcc_unreachable ();
     }
 
-  if (code != MULT
-      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+  if (code != MULT && op2vec)
     {
       /* Expand vashr/vlshr/vashl.  */
       res_l = gen_reg_rtx (himode);
@@ -23435,9 +23546,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
 
   ok = ix86_expand_vec_perm_const_1 (&d);
   gcc_assert (ok);
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-		       gen_rtx_fmt_ee (code, qimode, op1, op2));
 }
 
 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 71ae95ffef7..d0f5783173e 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -215,6 +215,7 @@ extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_round_sse4 (rtx, rtx);
 
+extern void ix86_expand_vecop_qihi_partial (enum rtx_code, rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
 extern rtx ix86_split_stack_guard (void);
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9ab24242b59..369a718c880 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20465,6 +20465,14 @@ ix86_multiplication_cost (const struct processor_costs *cost,
   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
     switch (mode)
       {
+      case V4QImode:
+      case V8QImode:
+	/* Partial V*QImode is emulated with 4-5 insns.  */
+	if ((TARGET_AVX512BW && TARGET_AVX512VL) || TARGET_XOP)
+	  return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 3);
+	else
+	  return ix86_vec_cost (mode, cost->mulss + cost->sse_op * 4);
+
       case V16QImode:
 	/* V*QImode is emulated with 4-11 insns.  */
 	if (TARGET_AVX512BW && TARGET_AVX512VL)
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index b2954fff8ae..45773673049 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2149,6 +2149,26 @@ (define_insn "mulv2hi3"
    (set_attr "type" "ssemul")
    (set_attr "mode" "TI")])
 
+(define_expand "mulv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+	(mult:V8QI (match_operand:V8QI 1 "register_operand")
+		   (match_operand:V8QI 2 "register_operand")))]
+  "TARGET_MMX_WITH_SSE"
+{
+  ix86_expand_vecop_qihi_partial (MULT, operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "mulv4qi3"
+  [(set (match_operand:V4QI 0 "register_operand")
+	(mult:V4QI (match_operand:V4QI 1 "register_operand")
+		   (match_operand:V4QI 2 "register_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_vecop_qihi_partial (MULT, operands[0], operands[1], operands[2]);
+  DONE;
+})
+
 (define_expand "mmx_smulv4hi3_highpart"
   [(set (match_operand:V4HI 0 "register_operand")
 	(truncate:V4HI
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f14a9c24ebd..26dd0b1aa10 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -14987,16 +14987,6 @@ (define_split
 	(eq:VI12_AVX2 (match_dup 4) (match_dup 1)))]
   "operands[4] = gen_reg_rtx (<MODE>mode);")
 
-(define_expand "mulv8qi3"
-  [(set (match_operand:V8QI 0 "register_operand")
-	(mult:V8QI (match_operand:V8QI 1 "register_operand")
-		   (match_operand:V8QI 2 "register_operand")))]
-  "TARGET_AVX512VL && TARGET_AVX512BW && TARGET_64BIT"
-{
-  ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
-  DONE;
-})
-
 (define_expand "mul<mode>3"
   [(set (match_operand:VI1_AVX512 0 "register_operand")
 	(mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
index dc684a167c8..5e9f4f2805c 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -1,7 +1,8 @@
 /* PR target/pr95488  */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512bw -mavx512vl" }  */
-/* { dg-final { scan-assembler-times "vpmovzxbw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpunpcklbw" 4 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
 /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovwb" 4 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-mulv4qi.c b/gcc/testsuite/gcc.target/i386/vect-mulv4qi.c
new file mode 100644
index 00000000000..d64bf044e91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-mulv4qi.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+#define N 4
+
+unsigned char ur[N], ua[N], ub[N];
+
+void mul (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    ur[i] = ua[i] * ub[i];
+}
+
+void mul_slp (void)
+{
+  ur[0] = ua[0] * ub[0];
+  ur[1] = ua[1] * ub[1];
+  ur[2] = ua[2] * ub[2];
+  ur[3] = ua[3] * ub[3];
+}
+
+/* { dg-final { scan-assembler-times "pmullw" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-mulv8qi.c b/gcc/testsuite/gcc.target/i386/vect-mulv8qi.c
new file mode 100644
index 00000000000..05003644ec7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-mulv8qi.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+#define N 8
+
+unsigned char ur[N], ua[N], ub[N];
+
+void mul (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    ur[i] = ua[i] * ub[i];
+}
+
+void mul_slp (void)
+{
+  ur[0] = ua[0] * ub[0];
+  ur[1] = ua[1] * ub[1];
+  ur[2] = ua[2] * ub[2];
+  ur[3] = ua[3] * ub[3];
+  ur[4] = ua[4] * ub[4];
+  ur[5] = ua[5] * ub[5];
+  ur[6] = ua[6] * ub[6];
+  ur[7] = ua[7] * ub[7];
+}
+
+/* { dg-final { scan-assembler-times "pmullw" 2 } } */