diff --git a/trans-array.c b/trans-array.c index acd9aec..262743d 100644 --- a/trans-array.c +++ b/trans-array.c @@ -6187,7 +6187,8 @@ gfc_conv_expr_descriptor (gfc_se * se, gfc_expr * expr, gfc_ss * ss) gcc_assert ((expr->value.function.esym != NULL && expr->value.function.esym->attr.elemental) || (expr->value.function.isym != NULL - && expr->value.function.isym->elemental)); + && expr->value.function.isym->elemental) + || gfc_inline_intrinsic_function_p (expr)); else gcc_assert (ss_type == GFC_SS_INTRINSIC); diff --git a/trans-intrinsic.c b/trans-intrinsic.c index 25c54fb..973f912 100644 --- a/trans-intrinsic.c +++ b/trans-intrinsic.c @@ -2557,6 +2557,20 @@ gfc_conv_intrinsic_count (gfc_se * se, gfc_expr * expr) se->expr = resvar; } + +/* Update given gfc_se to have ss component pointing to the nested gfc_ss + struct and return the corresponding loopinfo. */ + +static gfc_loopinfo * +enter_nested_loop (gfc_se *se) +{ + se->ss = se->ss->nested_ss; + gcc_assert (se->ss == se->ss->loop->ss); + + return se->ss->loop; +} + + /* Inline implementation of the sum and product intrinsics. */ static void gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, @@ -2570,18 +2584,18 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, tree tmp; gfc_loopinfo loop, *ploop; gfc_actual_arglist *arg_array, *arg_mask; - gfc_ss *arrayss; - gfc_ss *maskss; + gfc_ss *arrayss = NULL; + gfc_ss *maskss = NULL; gfc_se arrayse; gfc_se maskse; gfc_se *parent_se; gfc_expr *arrayexpr; gfc_expr *maskexpr; - if (se->ss) + if (expr->rank > 0) { - gfc_conv_intrinsic_funcall (se, expr); - return; + gcc_assert (gfc_inline_intrinsic_function_p (expr)); + parent_se = se; } else parent_se = NULL; @@ -2613,10 +2627,7 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, arg_array = expr->value.function.actual; - /* Walk the arguments. 
*/ arrayexpr = arg_array->expr; - arrayss = gfc_walk_expr (arrayexpr); - gcc_assert (arrayss != gfc_ss_terminator); if (op == NE_EXPR || norm2) /* PARITY and NORM2. */ @@ -2628,29 +2639,42 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, maskexpr = arg_mask->expr; } - if (maskexpr && maskexpr->rank > 0) + if (expr->rank == 0) { - maskss = gfc_walk_expr (maskexpr); - gcc_assert (maskss != gfc_ss_terminator); - } - else - maskss = NULL; + /* Walk the arguments. */ + arrayss = gfc_walk_expr (arrayexpr); + gcc_assert (arrayss != gfc_ss_terminator); - /* Initialize the scalarizer. */ - gfc_init_loopinfo (&loop); - gfc_add_ss_to_loop (&loop, arrayss); - if (maskexpr && maskexpr->rank > 0) - gfc_add_ss_to_loop (&loop, maskss); + if (maskexpr && maskexpr->rank > 0) + { + maskss = gfc_walk_expr (maskexpr); + gcc_assert (maskss != gfc_ss_terminator); + } + else + maskss = NULL; - /* Initialize the loop. */ - gfc_conv_ss_startstride (&loop); - gfc_conv_loop_setup (&loop, &expr->where); + /* Initialize the scalarizer. */ + gfc_init_loopinfo (&loop); + gfc_add_ss_to_loop (&loop, arrayss); + if (maskexpr && maskexpr->rank > 0) + gfc_add_ss_to_loop (&loop, maskss); - gfc_mark_ss_chain_used (arrayss, 1); - if (maskexpr && maskexpr->rank > 0) - gfc_mark_ss_chain_used (maskss, 1); + /* Initialize the loop. */ + gfc_conv_ss_startstride (&loop); + gfc_conv_loop_setup (&loop, &expr->where); + + gfc_mark_ss_chain_used (arrayss, 1); + if (maskexpr && maskexpr->rank > 0) + gfc_mark_ss_chain_used (maskss, 1); + + ploop = &loop; + } + else + /* All the work has been done in the parent loops. */ + ploop = enter_nested_loop (se); + + gcc_assert (ploop); - ploop = &loop; /* Generate the loop body. 
*/ gfc_start_scalarized_body (ploop, &body); @@ -2659,7 +2683,8 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, { gfc_init_se (&maskse, parent_se); gfc_copy_loopinfo_to_se (&maskse, ploop); - maskse.ss = maskss; + if (expr->rank == 0) + maskse.ss = maskss; gfc_conv_expr_val (&maskse, maskexpr); gfc_add_block_to_block (&body, &maskse.pre); @@ -2671,7 +2696,8 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, /* Do the actual summation/product. */ gfc_init_se (&arrayse, parent_se); gfc_copy_loopinfo_to_se (&arrayse, ploop); - arrayse.ss = arrayss; + if (expr->rank == 0) + arrayse.ss = arrayss; gfc_conv_expr_val (&arrayse, arrayexpr); gfc_add_block_to_block (&block, &arrayse.pre); @@ -2763,17 +2789,29 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, /* For a scalar mask, enclose the loop in an if statement. */ if (maskexpr && maskexpr->rank == 0) { - gfc_init_se (&maskse, NULL); - gfc_conv_expr_val (&maskse, maskexpr); gfc_init_block (&block); gfc_add_block_to_block (&block, &ploop->pre); gfc_add_block_to_block (&block, &ploop->post); tmp = gfc_finish_block (&block); - tmp = build3_v (COND_EXPR, maskse.expr, tmp, - build_empty_stmt (input_location)); + if (expr->rank > 0) + { + tmp = build3_v (COND_EXPR, se->ss->info->data.scalar.value, tmp, + build_empty_stmt (input_location)); + gfc_advance_se_ss_chain (se); + } + else + { + gcc_assert (expr->rank == 0); + gfc_init_se (&maskse, NULL); + gfc_conv_expr_val (&maskse, maskexpr); + tmp = build3_v (COND_EXPR, maskse.expr, tmp, + build_empty_stmt (input_location)); + } + gfc_add_expr_to_block (&block, tmp); gfc_add_block_to_block (&se->pre, &block); + gcc_assert (se->post.head == NULL); } else { @@ -2781,7 +2819,8 @@ gfc_conv_intrinsic_arith (gfc_se * se, gfc_expr * expr, enum tree_code op, gfc_add_block_to_block (&se->pre, &ploop->post); } - gfc_cleanup_loop (ploop); + if (expr->rank == 0) + gfc_cleanup_loop (ploop); if (norm2) { @@ 
-6801,12 +6840,127 @@ walk_inline_intrinsic_transpose (gfc_ss *ss, gfc_expr *expr) } +/* Move the given dimension of the given gfc_ss list to a nested gfc_ss list. + This has the side effect of reversing the nested list, so there is no + need to call gfc_reverse_ss on it (the given list is assumed not to be + reversed yet). */ + +static gfc_ss * +nest_loop_dimension (gfc_ss *ss, int dim) +{ + int ss_dim, i; + gfc_ss *new_ss, *prev_ss = gfc_ss_terminator; + gfc_loopinfo *new_loop; + + gcc_assert (ss != gfc_ss_terminator); + + for (; ss != gfc_ss_terminator; ss = ss->next) + { + new_ss = gfc_get_ss (); + new_ss->next = prev_ss; + new_ss->parent = ss; + new_ss->info = ss->info; + new_ss->info->refcount++; + if (ss->dimen != 0) + { + gcc_assert (ss->info->type != GFC_SS_SCALAR + && ss->info->type != GFC_SS_REFERENCE); + + new_ss->dimen = 1; + new_ss->dim[0] = ss->dim[dim]; + + gcc_assert (dim < ss->dimen); + + ss_dim = --ss->dimen; + for (i = dim; i < ss_dim; i++) + ss->dim[i] = ss->dim[i + 1]; + + ss->dim[ss_dim] = 0; + } + prev_ss = new_ss; + + if (ss->nested_ss) + { + ss->nested_ss->parent = new_ss; + new_ss->nested_ss = ss->nested_ss; + } + ss->nested_ss = new_ss; + } + + new_loop = gfc_get_loopinfo (); + gfc_init_loopinfo (new_loop); + + gcc_assert (prev_ss != NULL); + gcc_assert (prev_ss != gfc_ss_terminator); + gfc_add_ss_to_loop (new_loop, prev_ss); + return new_ss->parent; +} + + +/* Create the gfc_ss list for the SUM/PRODUCT arguments when the function + is to be inlined. */ + +static gfc_ss * +walk_inline_intrinsic_arith (gfc_ss *ss, gfc_expr *expr) +{ + gfc_ss *tmp_ss, *tail, *array_ss; + gfc_actual_arglist *arg1, *arg2, *arg3; + int sum_dim; + bool scalar_mask = false; + + /* The rank of the result will be determined later. 
*/ + arg1 = expr->value.function.actual; + arg2 = arg1->next; + arg3 = arg2->next; + gcc_assert (arg3 != NULL); + + if (expr->rank == 0) + return ss; + + tmp_ss = gfc_ss_terminator; + + if (arg3->expr) + { + gfc_ss *mask_ss; + + mask_ss = gfc_walk_subexpr (tmp_ss, arg3->expr); + if (mask_ss == tmp_ss) + scalar_mask = 1; + + tmp_ss = mask_ss; + } + + array_ss = gfc_walk_subexpr (tmp_ss, arg1->expr); + gcc_assert (array_ss != tmp_ss); + + /* Odd thing: If the mask is scalar, it is used by the frontend after + the array (to make an if around the nested loop). Thus it shall + be after array_ss once the gfc_ss list is reversed. */ + if (scalar_mask) + tmp_ss = gfc_get_scalar_ss (array_ss, arg3->expr); + else + tmp_ss = array_ss; + + /* "Hide" the dimension on which we will sum in the first arg's scalarization + chain. */ + sum_dim = mpz_get_si (arg2->expr->value.integer) - 1; + tail = nest_loop_dimension (tmp_ss, sum_dim); + tail->next = ss; + + return tmp_ss; +} + + static gfc_ss * walk_inline_intrinsic_function (gfc_ss * ss, gfc_expr * expr) { switch (expr->value.function.isym->id) { + case GFC_ISYM_PRODUCT: + case GFC_ISYM_SUM: + return walk_inline_intrinsic_arith (ss, expr); + case GFC_ISYM_TRANSPOSE: return walk_inline_intrinsic_transpose (ss, expr); @@ -6868,11 +7022,26 @@ gfc_walk_intrinsic_libfunc (gfc_ss * ss, gfc_expr * expr) bool gfc_inline_intrinsic_function_p (gfc_expr *expr) { + gfc_actual_arglist *args; + if (!expr->value.function.isym) return false; switch (expr->value.function.isym->id) { + case GFC_ISYM_PRODUCT: + case GFC_ISYM_SUM: + /* Disable inline expansion if code size matters. */ + if (optimize_size) + return false; + + args = expr->value.function.actual; + /* We need to be able to subset the SUM argument at compile-time. 
*/ + if (args->next->expr && args->next->expr->expr_type != EXPR_CONSTANT) + return false; + + return true; + case GFC_ISYM_TRANSPOSE: return true; diff --git a/trans.h b/trans.h index 5757865..22033d3 100644 --- a/trans.h +++ b/trans.h @@ -310,6 +310,7 @@ typedef struct gfc_loopinfo } gfc_loopinfo; +#define gfc_get_loopinfo() XCNEW (gfc_loopinfo) /* Information about a symbol that has been shadowed by a temporary. */ typedef struct