From 54d4d0ff6cd40638d026c01e46deb102e7951ba6 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 16 Sep 2017 13:20:32 -0400
Subject: [PATCH] Fix SQL-spec incompatibilities in new transition table
 feature.

The standard says that all changes of the same kind (insert, update, or
delete) caused in one table by a single SQL statement should be reported
in a single transition table; and by that, they mean to include foreign key
enforcement actions cascading from the statement's direct effects.  It's
also reasonable to conclude that if the standard had wCTEs, they would say
that effects of wCTEs applying to the same table as each other or the outer
statement should be merged into one transition table.  We weren't doing it
like that.

Hence, arrange to merge tuples from multiple update actions into a single
transition table as much as we can.  There is a problem, which is that if
the firing of FK enforcement triggers and after-row triggers with
transition tables is interspersed, we might need to report more tuples
after some triggers have already seen the transition table.  It seems like
a bad idea for the transition table to be mutable between trigger calls.
There's no good way around this without a major redesign of the FK logic,
so for now, resolve it by opening a new transition table each time this
happens.

Also, ensure that AFTER STATEMENT triggers fire just once per statement,
or once per transition table when we're forced to make more than one.
Previous versions of Postgres have allowed each FK enforcement query
to cause an additional firing of the AFTER STATEMENT triggers for the
referencing table, but that's certainly not per spec.  (We're still
doing multiple firings of BEFORE STATEMENT triggers, though; is that
something worth changing?)

Also, forbid using transition tables with column-specific UPDATE triggers.
The spec requires such transition tables to show only the tuples for which
the UPDATE trigger would have fired, which means maintaining multiple
transition tables or else somehow filtering the contents at readout.
Maybe someday we'll bother to support that option, but it looks like a
lot of trouble for a marginal feature.

The transition tables are now managed by the AfterTriggers data structures,
rather than being directly the responsibility of ModifyTable nodes.  This
removes a subtransaction-lifespan memory leak introduced by my previous
band-aid patch 3c4359521.

In passing, refactor the AfterTriggers data structures to reduce the
management overhead for them, by using arrays of structs rather than
several parallel arrays for per-query-level and per-subtransaction state.

I failed to resist the temptation to do some copy-editing on the SGML
docs about triggers, above and beyond merely documenting the effects
of this patch.

Back-patch to v10, because we don't want the semantics of transition
tables to change post-release.

Patch by me, with help and review from Thomas Munro.

Discussion: https://postgr.es/m/20170909064853.25630.12825@wrigleys.postgresql.org
---
 doc/src/sgml/ref/create_trigger.sgml   | 112 +++-
 doc/src/sgml/trigger.sgml              |  54 +-
 src/backend/commands/copy.c            |  10 +-
 src/backend/commands/trigger.c         | 822 ++++++++++++++++---------
 src/backend/executor/README            |   2 +-
 src/backend/executor/execMain.c        |  11 +-
 src/backend/executor/nodeModifyTable.c |  57 +-
 src/include/commands/trigger.h         |  29 +-
 src/include/nodes/execnodes.h          |   4 +-
 src/test/regress/expected/triggers.out |  52 +-
 src/test/regress/sql/triggers.sql      |  42 ++
 11 files changed, 804 insertions(+), 391 deletions(-)
diff --git a/doc/src/sgml/ref/create_trigger.sgml b/doc/src/sgml/ref/create_trigger.sgml
index 18efe6a9ed7..065c8272710 100644
--- a/doc/src/sgml/ref/create_trigger.sgml
+++ b/doc/src/sgml/ref/create_trigger.sgml
@@ -52,7 +52,7 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
    trigger will be associated with the specified table, view, or foreign table
    and will execute the specified
    function <replaceable class="parameter">function_name</replaceable> when
-   certain events occur.
+   certain operations are performed on that table.
   </para>
 
   <para>
@@ -82,10 +82,7 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
    executes once for any given operation, regardless of how many rows
    it modifies (in particular, an operation that modifies zero rows
    will still result in the execution of any applicable <literal>FOR
-   EACH STATEMENT</literal> triggers).  Note that with an
-   <command>INSERT</command> with an <literal>ON CONFLICT DO UPDATE</>
-   clause, both <command>INSERT</command> and
-   <command>UPDATE</command> statement level trigger will be fired.
+   EACH STATEMENT</literal> triggers).
   </para>
 
   <para>
@@ -174,7 +171,8 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
    <firstterm>constraint trigger</>.  This is the same as a regular trigger
    except that the timing of the trigger firing can be adjusted using
    <xref linkend="SQL-SET-CONSTRAINTS">.
-   Constraint triggers must be <literal>AFTER ROW</> triggers on tables.  They
+   Constraint triggers must be <literal>AFTER ROW</> triggers on plain
+   tables (not foreign tables).  They
    can be fired either at the end of the statement causing the triggering
    event, or at the end of the containing transaction; in the latter case they
    are said to be <firstterm>deferred</>.  A pending deferred-trigger firing
@@ -184,18 +182,29 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
   </para>
 
   <para>
-   The <literal>REFERENCING</> option is only allowed for an <literal>AFTER</>
-   trigger which is not a constraint trigger.  <literal>OLD TABLE</> may only
-   be specified once, and only on a trigger which can fire on
-   <literal>UPDATE</> or <literal>DELETE</>.  <literal>NEW TABLE</> may only
-   be specified once, and only on a trigger which can fire on
-   <literal>UPDATE</> or <literal>INSERT</>.
+   The <literal>REFERENCING</> option enables collection
+   of <firstterm>transition relations</>, which are row sets that include all
+   of the rows inserted, deleted, or modified by the current SQL statement.
+   This feature lets the trigger see a global view of what the statement did,
+   not just one row at a time.  This option is only allowed for
+   an <literal>AFTER</> trigger that is not a constraint trigger; also, if
+   the trigger is an <literal>UPDATE</> trigger, it must not specify
+   a <replaceable class="parameter">column_name</replaceable> list.
+   <literal>OLD TABLE</> may only be specified once, and only for a trigger
+   that can fire on <literal>UPDATE</> or <literal>DELETE</>; it creates a
+   transition relation containing the <firstterm>before-images</> of all rows
+   updated or deleted by the statement.
+   Similarly, <literal>NEW TABLE</> may only be specified once, and only for
+   a trigger that can fire on <literal>UPDATE</> or <literal>INSERT</>;
+   it creates a transition relation containing the <firstterm>after-images</>
+   of all rows updated or inserted by the statement.
   </para>
 
   <para>
    <command>SELECT</command> does not modify any rows so you cannot
-   create <command>SELECT</command> triggers. Rules and views are more
-   appropriate in such cases.
+   create <command>SELECT</command> triggers.  Rules and views may provide
+   workable solutions to problems that seem to need <command>SELECT</command>
+   triggers.
   </para>
 
   <para>
@@ -300,12 +309,9 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
     <term><literal>REFERENCING</literal></term>
     <listitem>
      <para>
-      This immediately precedes the declaration of one or two relations which
-      can be used to read the before and/or after images of all rows directly
-      affected by the triggering statement.  An <literal>AFTER EACH ROW</>
-      trigger is allowed to use both these transition relation names and the
-      row names (<literal>OLD</> and <literal>NEW</>) which reference each
-      individual row for which the trigger fires.
+      This keyword immediately precedes the declaration of one or two
+      relation names that provide access to the transition relations of the
+      triggering statement.
      </para>
     </listitem>
    </varlistentry>
@@ -315,8 +321,9 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
     <term><literal>NEW TABLE</literal></term>
     <listitem>
      <para>
-      This specifies whether the named relation contains the before or after
-      images for rows affected by the statement which fired the trigger.
+      This clause indicates whether the following relation name is for the
+      before-image transition relation or the after-image transition
+      relation.
      </para>
     </listitem>
    </varlistentry>
@@ -325,7 +332,8 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
     <term><replaceable class="PARAMETER">transition_relation_name</replaceable></term>
     <listitem>
      <para>
-      The (unqualified) name to be used within the trigger for this relation.
+      The (unqualified) name to be used within the trigger for this
+      transition relation.
      </para>
     </listitem>
    </varlistentry>
@@ -458,6 +466,35 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
    rows.
   </para>
 
+  <para>
+   In some cases it is possible for a single SQL command to fire more than
+   one kind of trigger.  For instance, an <command>INSERT</command> with
+   an <literal>ON CONFLICT DO UPDATE</> clause may cause both insert and
+   update operations, so it will fire both kinds of triggers as needed.
+   The transition relations supplied to triggers are
+   specific to their event type; thus an <command>INSERT</command> trigger
+   will see only the inserted rows, while an <command>UPDATE</command>
+   trigger will see only the updated rows.
+  </para>
+
+  <para>
+   Row updates or deletions caused by foreign-key enforcement actions, such
+   as <literal>ON UPDATE CASCADE</> or <literal>ON DELETE SET NULL</>, are
+   treated as part of the SQL command that caused them (note that such
+   actions are never deferred).  Relevant triggers on the affected table will
+   be fired, so this provides another way in which a SQL command might
+   fire triggers not directly matching its type.  In simple cases, triggers
+   that request transition relations will see all changes caused in their
+   table by a single original SQL command as a single transition relation.
+   However, there are cases in which the presence of an <literal>AFTER ROW</>
+   trigger that requests transition relations will cause the foreign-key
+   enforcement actions triggered by a single SQL command to be split into
+   multiple steps, each with its own transition relation(s).  In such cases,
+   any <literal>AFTER STATEMENT</> triggers that are present will be fired
+   once per creation of a transition relation, ensuring that the triggers see
+   each affected row once and only once.
+  </para>
+
   <para>
     Modifying a partitioned table or a table with inheritance children fires
     statement-level triggers directly attached to that table, but not
@@ -589,19 +626,30 @@ CREATE TRIGGER paired_items_update
    <itemizedlist>
     <listitem>
      <para>
-      While transition tables for <literal>AFTER</> triggers are specified
-      using the <literal>REFERENCING</> clause in the standard way, the row
-      variables used in <literal>FOR EACH ROW</> triggers may not be
-      specified in <literal>REFERENCING</> clause.  They are available in a
-      manner which is dependent on the language in which the trigger function
-      is written.  Some languages effectively behave as though there is a
-      <literal>REFERENCING</> clause containing <literal>OLD ROW AS OLD NEW
-      ROW AS NEW</>.
+      While transition table names for <literal>AFTER</> triggers are
+      specified using the <literal>REFERENCING</> clause in the standard way,
+      the row variables used in <literal>FOR EACH ROW</> triggers may not be
+      specified in a <literal>REFERENCING</> clause.  They are available in a
+      manner that is dependent on the language in which the trigger function
+      is written, but is fixed for any one language.  Some languages
+      effectively behave as though there is a <literal>REFERENCING</> clause
+      containing <literal>OLD ROW AS OLD NEW ROW AS NEW</>.
      </para>
     </listitem>
 
     <listitem>
-     <para><productname>PostgreSQL</productname> only allows the execution
+     <para>
+      The standard allows transition tables to be used with
+      column-specific <literal>UPDATE</> triggers, but then the set of rows
+      that should be visible in the transition tables depends on the
+      trigger's column list.  This is not currently implemented by
+      <productname>PostgreSQL</productname>.
+     </para>
+    </listitem>
+
+    <listitem>
+     <para>
+      <productname>PostgreSQL</productname> only allows the execution
       of a user-defined function for the triggered action.  The standard
       allows the execution of a number of other SQL commands, such as
       <command>CREATE TABLE</command>, as the triggered action.  This
diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml
index 950245d19a2..a16256056f0 100644
--- a/doc/src/sgml/trigger.sgml
+++ b/doc/src/sgml/trigger.sgml
@@ -41,17 +41,13 @@
     On tables and foreign tables, triggers can be defined to execute either
     before or after any <command>INSERT</command>, <command>UPDATE</command>,
     or <command>DELETE</command> operation, either once per modified row,
-    or once per <acronym>SQL</acronym> statement.  If an
-    <command>INSERT</command> contains an <literal>ON CONFLICT DO UPDATE</>
-    clause, it is possible that the effects of a BEFORE insert trigger and
-    a BEFORE update trigger can both be applied together, if a reference to
-    an <varname>EXCLUDED</> column appears.  <command>UPDATE</command>
-    triggers can moreover be set to fire only if certain columns are
-    mentioned in the <literal>SET</literal> clause of the
-    <command>UPDATE</command> statement.  Triggers can also fire for
-    <command>TRUNCATE</command> statements.  If a trigger event occurs,
+    or once per <acronym>SQL</acronym> statement.
+    <command>UPDATE</command> triggers can moreover be set to fire only if
+    certain columns are mentioned in the <literal>SET</literal> clause of
+    the <command>UPDATE</command> statement.  Triggers can also fire
+    for <command>TRUNCATE</command> statements.  If a trigger event occurs,
     the trigger's function is called at the appropriate time to handle the
-    event.  Foreign tables do not support the TRUNCATE statement at all.
+    event.
    </para>
 
    <para>
@@ -97,10 +93,7 @@
     two types of triggers are sometimes called <firstterm>row-level</>
     triggers and <firstterm>statement-level</> triggers,
     respectively. Triggers on <command>TRUNCATE</command> may only be
-    defined at statement level.  On views, triggers that fire before or
-    after may only be defined at statement level, while triggers that fire
-    instead of an <command>INSERT</command>, <command>UPDATE</command>,
-    or <command>DELETE</command> may only be defined at row level.
+    defined at statement level, not per-row.
    </para>
 
    <para>
@@ -117,9 +110,9 @@
     operated on, while row-level <literal>AFTER</> triggers fire at the end of
     the statement (but before any statement-level <literal>AFTER</> triggers).
     These types of triggers may only be defined on non-partitioned tables and
-    foreign tables.  Row-level <literal>INSTEAD OF</> triggers may only be
-    defined on views, and fire immediately as each row in the view is
-    identified as needing to be operated on.
+    foreign tables, not views.  <literal>INSTEAD OF</> triggers may only be
+    defined on views, and only at row level; they fire immediately as each
+    row in the view is identified as needing to be operated on.
    </para>
 
    <para>
@@ -132,18 +125,19 @@
 
    <para>
     If an <command>INSERT</command> contains an <literal>ON CONFLICT
-    DO UPDATE</> clause, it is possible that the effects of all
-    row-level <literal>BEFORE</> <command>INSERT</command> triggers
-    and all row-level <literal>BEFORE</literal> <command>UPDATE</command> triggers can
+    DO UPDATE</> clause, it is possible that the effects of
+    row-level <literal>BEFORE</> <command>INSERT</command> triggers and
+    row-level <literal>BEFORE</literal> <command>UPDATE</command> triggers can
     both be applied in a way that is apparent from the final state of
     the updated row, if an <varname>EXCLUDED</> column is referenced.
     There need not be an <varname>EXCLUDED</> column reference for
-    both sets of row-level <literal>BEFORE</literal> triggers to execute, though.  The
+    both sets of row-level <literal>BEFORE</literal> triggers to execute,
+    though.  The
     possibility of surprising outcomes should be considered when there
     are both <literal>BEFORE</> <command>INSERT</command> and
     <literal>BEFORE</> <command>UPDATE</command> row-level triggers
-    that both affect a row being inserted/updated (this can still be
-    problematic if the modifications are more or less equivalent if
+    that change a row being inserted/updated (this can be
+    problematic even if the modifications are more or less equivalent, if
     they're not also idempotent).  Note that statement-level
     <command>UPDATE</command> triggers are executed when <literal>ON
     CONFLICT DO UPDATE</> is specified, regardless of whether or not
@@ -314,8 +308,18 @@
     <varname>NEW</varname> row for <command>INSERT</command> and
     <command>UPDATE</command> triggers, and/or the <varname>OLD</varname> row
     for <command>UPDATE</command> and <command>DELETE</command> triggers.
-    Statement-level triggers do not currently have any way to examine the
-    individual row(s) modified by the statement.
+   </para>
+
+   <para>
+    By default, statement-level triggers do not have any way to examine the
+    individual row(s) modified by the statement.  But an <literal>AFTER
+    STATEMENT</> trigger can request that <firstterm>transition tables</>
+    be created to make the sets of affected rows available to the trigger.
+    <literal>AFTER ROW</> triggers can also request transition tables, so
+    that they can see the total changes in the table as well as the change in
+    the individual row they are currently being fired for.  The syntax for
+    examining the transition tables again depends on the programming language
+    that is being used.
    </para>
 
   </sect1>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 375a25fbcf8..ad1fcd8d77b 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2429,12 +2429,17 @@ CopyFrom(CopyState cstate)
 	/* Triggers might need a slot as well */
 	estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
 
+	/* Prepare to catch AFTER triggers. */
+	AfterTriggerBeginQuery();
+
 	/*
 	 * If there are any triggers with transition tables on the named relation,
 	 * we need to be prepared to capture transition tuples.
 	 */
 	cstate->transition_capture =
-		MakeTransitionCaptureState(cstate->rel->trigdesc);
+		MakeTransitionCaptureState(cstate->rel->trigdesc,
+								   RelationGetRelid(cstate->rel),
+								   CMD_INSERT);
 
 	/*
 	 * If the named relation is a partitioned table, initialize state for
@@ -2510,9 +2515,6 @@ CopyFrom(CopyState cstate)
 		bufferedTuples = palloc(MAX_BUFFERED_TUPLES * sizeof(HeapTuple));
 	}
 
-	/* Prepare to catch AFTER triggers. */
-	AfterTriggerBeginQuery();
-
 	/*
 	 * Check BEFORE STATEMENT insertion triggers. It's debatable whether we
 	 * should do this for COPY, since it's not really an "INSERT" statement as
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 269c9e17dd1..7e391a10921 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -234,6 +234,11 @@ CreateTrigger(CreateTrigStmt *stmt, const char *queryString,
 							RelationGetRelationName(rel)),
 					 errdetail("Foreign tables cannot have TRUNCATE triggers.")));
 
+		/*
+		 * We disallow constraint triggers to protect the assumption that
+		 * triggers on foreign tables can't be deferred.  See notes with
+		 * AfterTriggers data structures, below.
+		 */
 		if (stmt->isconstraint)
 			ereport(ERROR,
 					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -418,6 +423,26 @@ CreateTrigger(CreateTrigStmt *stmt, const char *queryString,
 						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 						 errmsg("transition tables cannot be specified for triggers with more than one event")));
 
+			/*
+			 * We currently don't allow column-specific triggers with
+			 * transition tables.  Per spec, that seems to require
+			 * accumulating separate transition tables for each combination of
+			 * columns, which is a lot of work for a rather marginal feature.
+			 */
+			if (stmt->columns != NIL)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("transition tables cannot be specified for triggers with column lists")));
+
+			/*
+			 * We disallow constraint triggers with transition tables, to
+			 * protect the assumption that such triggers can't be deferred.
+			 * See notes with AfterTriggers data structures, below.
+			 *
+			 * Currently this is enforced by the grammar, so just Assert here.
+			 */
+			Assert(!stmt->isconstraint);
+
 			if (tt->isNew)
 			{
 				if (!(TRIGGER_FOR_INSERT(tgtype) ||
@@ -2085,96 +2110,6 @@ FindTriggerIncompatibleWithInheritance(TriggerDesc *trigdesc)
 	return NULL;
 }
 
-/*
- * Make a TransitionCaptureState object from a given TriggerDesc.  The
- * resulting object holds the flags which control whether transition tuples
- * are collected when tables are modified, and the tuplestores themselves.
- * Note that we copy the flags from a parent table into this struct (rather
- * than using each relation's TriggerDesc directly) so that we can use it to
- * control the collection of transition tuples from child tables.
- *
- * If there are no triggers with transition tables configured for 'trigdesc',
- * then return NULL.
- *
- * The resulting object can be passed to the ExecAR* functions.  The caller
- * should set tcs_map or tcs_original_insert_tuple as appropriate when dealing
- * with child tables.
- */
-TransitionCaptureState *
-MakeTransitionCaptureState(TriggerDesc *trigdesc)
-{
-	TransitionCaptureState *state = NULL;
-
-	if (trigdesc != NULL &&
-		(trigdesc->trig_delete_old_table || trigdesc->trig_update_old_table ||
-		 trigdesc->trig_update_new_table || trigdesc->trig_insert_new_table))
-	{
-		MemoryContext oldcxt;
-		ResourceOwner saveResourceOwner;
-
-		/*
-		 * Normally DestroyTransitionCaptureState should be called after
-		 * executing all AFTER triggers for the current statement.
-		 *
-		 * To handle error cleanup, TransitionCaptureState and the tuplestores
-		 * it contains will live in the current [sub]transaction's memory
-		 * context.  Likewise for the current resource owner, because we also
-		 * want to clean up temporary files spilled to disk by the tuplestore
-		 * in that scenario.  This scope is sufficient, because AFTER triggers
-		 * with transition tables cannot be deferred (only constraint triggers
-		 * can be deferred, and constraint triggers cannot have transition
-		 * tables).  The AFTER trigger queue may contain pointers to this
-		 * TransitionCaptureState, but any such entries will be processed or
-		 * discarded before the end of the current [sub]transaction.
-		 *
-		 * If a future release allows deferred triggers with transition
-		 * tables, we'll need to reconsider the scope of the
-		 * TransitionCaptureState object.
-		 */
-		oldcxt = MemoryContextSwitchTo(CurTransactionContext);
-		saveResourceOwner = CurrentResourceOwner;
-
-		state = (TransitionCaptureState *)
-			palloc0(sizeof(TransitionCaptureState));
-		state->tcs_delete_old_table = trigdesc->trig_delete_old_table;
-		state->tcs_update_old_table = trigdesc->trig_update_old_table;
-		state->tcs_update_new_table = trigdesc->trig_update_new_table;
-		state->tcs_insert_new_table = trigdesc->trig_insert_new_table;
-		PG_TRY();
-		{
-			CurrentResourceOwner = CurTransactionResourceOwner;
-			if (trigdesc->trig_delete_old_table || trigdesc->trig_update_old_table)
-				state->tcs_old_tuplestore = tuplestore_begin_heap(false, false, work_mem);
-			if (trigdesc->trig_insert_new_table)
-				state->tcs_insert_tuplestore = tuplestore_begin_heap(false, false, work_mem);
-			if (trigdesc->trig_update_new_table)
-				state->tcs_update_tuplestore = tuplestore_begin_heap(false, false, work_mem);
-		}
-		PG_CATCH();
-		{
-			CurrentResourceOwner = saveResourceOwner;
-			PG_RE_THROW();
-		}
-		PG_END_TRY();
-		CurrentResourceOwner = saveResourceOwner;
-		MemoryContextSwitchTo(oldcxt);
-	}
-
-	return state;
-}
-
-void
-DestroyTransitionCaptureState(TransitionCaptureState *tcs)
-{
-	if (tcs->tcs_insert_tuplestore != NULL)
-		tuplestore_end(tcs->tcs_insert_tuplestore);
-	if (tcs->tcs_update_tuplestore != NULL)
-		tuplestore_end(tcs->tcs_update_tuplestore);
-	if (tcs->tcs_old_tuplestore != NULL)
-		tuplestore_end(tcs->tcs_old_tuplestore);
-	pfree(tcs);
-}
-
 /*
  * Call a trigger function.
  *
@@ -3338,9 +3273,11 @@ TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
  * during the current transaction tree.  (BEFORE triggers are fired
  * immediately so we don't need any persistent state about them.)  The struct
  * and most of its subsidiary data are kept in TopTransactionContext; however
- * the individual event records are kept in a separate sub-context.  This is
- * done mainly so that it's easy to tell from a memory context dump how much
- * space is being eaten by trigger events.
+ * some data that can be discarded sooner appears in the CurTransactionContext
+ * of the relevant subtransaction.  Also, the individual event records are
+ * kept in a separate sub-context of TopTransactionContext.  This is done
+ * mainly so that it's easy to tell from a memory context dump how much space
+ * is being eaten by trigger events.
  *
  * Because the list of pending events can grow large, we go to some
  * considerable effort to minimize per-event memory consumption.  The event
@@ -3400,6 +3337,13 @@ typedef SetConstraintStateData *SetConstraintState;
  * tuple(s).  This permits storing tuples once regardless of the number of
  * row-level triggers on a foreign table.
  *
+ * Note that we need triggers on foreign tables to be fired in exactly the
+ * order they were queued, so that the tuples come out of the tuplestore in
+ * the right order.  To ensure that, we forbid deferrable (constraint)
+ * triggers on foreign tables.  This also ensures that such triggers do not
+ * get deferred into outer trigger query levels, meaning that it's okay to
+ * destroy the tuplestore at the end of the query level.
+ *
  * Statement-level triggers always bear AFTER_TRIGGER_1CTID, though they
  * require no ctid field.  We lack the flag bit space to neatly represent that
  * distinct case, and it seems unlikely to be worth much trouble.
@@ -3433,7 +3377,7 @@ typedef struct AfterTriggerSharedData
 	Oid			ats_tgoid;		/* the trigger's ID */
 	Oid			ats_relid;		/* the relation it's on */
 	CommandId	ats_firing_id;	/* ID for firing cycle */
-	TransitionCaptureState *ats_transition_capture;
+	struct AfterTriggersTableData *ats_table;	/* transition table access */
 } AfterTriggerSharedData;
 
 typedef struct AfterTriggerEventData *AfterTriggerEvent;
@@ -3505,6 +3449,14 @@ typedef struct AfterTriggerEventList
 #define for_each_event_chunk(eptr, cptr, evtlist) \
 	for_each_chunk(cptr, evtlist) for_each_event(eptr, cptr)
 
+/* Macros for iterating from a start point that might not be list start */
+#define for_each_chunk_from(cptr) \
+	for (; cptr != NULL; cptr = cptr->next)
+#define for_each_event_from(eptr, cptr) \
+	for (; \
+		 (char *) eptr < (cptr)->freeptr; \
+		 eptr = (AfterTriggerEvent) (((char *) eptr) + SizeofTriggerEvent(eptr)))
+
 
 /*
  * All per-transaction data for the AFTER TRIGGERS module.
@@ -3529,60 +3481,107 @@ typedef struct AfterTriggerEventList
  * query_depth is the current depth of nested AfterTriggerBeginQuery calls
  * (-1 when the stack is empty).
  *
- * query_stack[query_depth] is a list of AFTER trigger events queued by the
- * current query (and the query_stack entries below it are lists of trigger
- * events queued by calling queries).  None of these are valid until the
- * matching AfterTriggerEndQuery call occurs.  At that point we fire
- * immediate-mode triggers, and append any deferred events to the main events
- * list.
+ * query_stack[query_depth] is the per-query-level data, including these fields:
  *
- * fdw_tuplestores[query_depth] is a tuplestore containing the foreign tuples
- * needed for the current query.
+ * events is a list of AFTER trigger events queued by the current query.
+ * None of these are valid until the matching AfterTriggerEndQuery call
+ * occurs.  At that point we fire immediate-mode triggers, and append any
+ * deferred events to the main events list.
  *
- * maxquerydepth is just the allocated length of query_stack and the
- * tuplestores.
+ * fdw_tuplestore is a tuplestore containing the foreign-table tuples
+ * needed by events queued by the current query.  (Note: we use just one
+ * tuplestore even though more than one foreign table might be involved.
+ * This is okay because tuplestores don't really care what's in the tuples
+ * they store; but it's possible that someday it'd break.)
  *
- * state_stack is a stack of pointers to saved copies of the SET CONSTRAINTS
- * state data; each subtransaction level that modifies that state first
+ * tables is a List of AfterTriggersTableData structs for target tables
+ * of the current query (see below).
+ *
+ * maxquerydepth is just the allocated length of query_stack.
+ *
+ * trans_stack holds per-subtransaction data, including these fields:
+ *
+ * state is NULL or a pointer to a saved copy of the SET CONSTRAINTS
+ * state data.  Each subtransaction level that modifies that state first
  * saves a copy, which we use to restore the state if we abort.
  *
- * events_stack is a stack of copies of the events head/tail pointers,
+ * events is a copy of the events head/tail pointers,
  * which we use to restore those values during subtransaction abort.
  *
- * depth_stack is a stack of copies of subtransaction-start-time query_depth,
+ * query_depth is the subtransaction-start-time value of query_depth,
  * which we similarly use to clean up at subtransaction abort.
  *
- * firing_stack is a stack of copies of subtransaction-start-time
- * firing_counter.  We use this to recognize which deferred triggers were
- * fired (or marked for firing) within an aborted subtransaction.
+ * firing_counter is the subtransaction-start-time value of firing_counter.
+ * We use this to recognize which deferred triggers were fired (or marked
+ * for firing) within an aborted subtransaction.
  *
  * We use GetCurrentTransactionNestLevel() to determine the correct array
- * index in these stacks.  maxtransdepth is the number of allocated entries in
- * each stack.  (By not keeping our own stack pointer, we can avoid trouble
+ * index in trans_stack.  maxtransdepth is the number of allocated entries in
+ * trans_stack.  (By not keeping our own stack pointer, we can avoid trouble
  * in cases where errors during subxact abort cause multiple invocations
  * of AfterTriggerEndSubXact() at the same nesting depth.)
+ *
+ * We create an AfterTriggersTableData struct for each target table of the
+ * current query, and each operation mode (INSERT/UPDATE/DELETE), that has
+ * either transition tables or AFTER STATEMENT triggers.  This is used to
+ * hold the relevant transition tables, as well as info tracking whether
+ * we already queued the AFTER STATEMENT triggers.  (We use that info to
+ * prevent, as much as possible, firing the same AFTER STATEMENT trigger
+ * more than once per statement.)  These structs, along with the transition
+ * table tuplestores, live in the (sub)transaction's CurTransactionContext.
+ * That's sufficient lifespan because we don't allow transition tables to be
+ * used by deferrable triggers, so they only need to survive until
+ * AfterTriggerEndQuery.
  */
+typedef struct AfterTriggersQueryData AfterTriggersQueryData;
+typedef struct AfterTriggersTransData AfterTriggersTransData;
+typedef struct AfterTriggersTableData AfterTriggersTableData;
+
 typedef struct AfterTriggersData
 {
 	CommandId	firing_counter; /* next firing ID to assign */
 	SetConstraintState state;	/* the active S C state */
 	AfterTriggerEventList events;	/* deferred-event list */
-	int			query_depth;	/* current query list index */
-	AfterTriggerEventList *query_stack; /* events pending from each query */
-	Tuplestorestate **fdw_tuplestores;	/* foreign tuples for one row from
-										 * each query */
-	int			maxquerydepth;	/* allocated len of above array */
 	MemoryContext event_cxt;	/* memory context for events, if any */
 
-	/* these fields are just for resetting at subtrans abort: */
+	/* per-query-level data: */
+	AfterTriggersQueryData *query_stack;	/* array of structs shown below */
+	int			query_depth;	/* current index in above array */
+	int			maxquerydepth;	/* allocated len of above array */
 
-	SetConstraintState *state_stack;	/* stacked S C states */
-	AfterTriggerEventList *events_stack;	/* stacked list pointers */
-	int		   *depth_stack;	/* stacked query_depths */
-	CommandId  *firing_stack;	/* stacked firing_counters */
-	int			maxtransdepth;	/* allocated len of above arrays */
+	/* per-subtransaction-level data: */
+	AfterTriggersTransData *trans_stack;	/* array of structs shown below */
+	int			maxtransdepth;	/* allocated len of above array */
 } AfterTriggersData;
 
+struct AfterTriggersQueryData
+{
+	AfterTriggerEventList events;	/* events pending from this query level */
+	Tuplestorestate *fdw_tuplestore;	/* foreign tuples for said events, or NULL */
+	List	   *tables;			/* list of AfterTriggersTableData, or NIL */
+};
+
+struct AfterTriggersTransData
+{
+	/* these fields are just for resetting at subtrans abort: */
+	SetConstraintState state;	/* saved S C state, or NULL if not yet saved */
+	AfterTriggerEventList events;	/* deferred-event list as of subxact start */
+	int			query_depth;	/* query_depth as of subxact start */
+	CommandId	firing_counter; /* firing_counter as of subxact start */
+};
+
+struct AfterTriggersTableData
+{
+	/* relid + cmdType form the lookup key; closed entries are never reused: */
+	Oid			relid;			/* target table's OID */
+	CmdType		cmdType;		/* event type, CMD_INSERT/UPDATE/DELETE */
+	bool		closed;			/* true once a trigger was given the tables */
+	bool		stmt_trig_done; /* did we already queue stmt-level triggers? */
+	AfterTriggerEventList stmt_trig_events; /* if so, saved list pointer */
+	Tuplestorestate *old_tuplestore;	/* "old" transition table, else NULL */
+	Tuplestorestate *new_tuplestore;	/* "new" transition table, else NULL */
+};
+
 static AfterTriggersData afterTriggers;
 
 static void AfterTriggerExecute(AfterTriggerEvent event,
@@ -3591,38 +3590,41 @@ static void AfterTriggerExecute(AfterTriggerEvent event,
 					Instrumentation *instr,
 					MemoryContext per_tuple_context,
 					TupleTableSlot *trig_tuple_slot1,
-					TupleTableSlot *trig_tuple_slot2,
-					TransitionCaptureState *transition_capture);
+					TupleTableSlot *trig_tuple_slot2);
+static AfterTriggersTableData *GetAfterTriggersTableData(Oid relid,
+						  CmdType cmdType);
+static void AfterTriggerFreeQuery(AfterTriggersQueryData *qs);
 static SetConstraintState SetConstraintStateCreate(int numalloc);
 static SetConstraintState SetConstraintStateCopy(SetConstraintState state);
 static SetConstraintState SetConstraintStateAddItem(SetConstraintState state,
 						  Oid tgoid, bool tgisdeferred);
+static void cancel_prior_stmt_triggers(Oid relid, CmdType cmdType, int tgevent);
 
 
 /*
- * Gets a current query transition tuplestore and initializes it if necessary.
+ * Get the FDW tuplestore for the current trigger query level, creating it
+ * if necessary.
  */
 static Tuplestorestate *
-GetTriggerTransitionTuplestore(Tuplestorestate **tss)
+GetCurrentFDWTuplestore(void)
 {
 	Tuplestorestate *ret;
 
-	ret = tss[afterTriggers.query_depth];
+	ret = afterTriggers.query_stack[afterTriggers.query_depth].fdw_tuplestore;
 	if (ret == NULL)
 	{
 		MemoryContext oldcxt;
 		ResourceOwner saveResourceOwner;
 
 		/*
-		 * Make the tuplestore valid until end of transaction.  This is the
-		 * allocation lifespan of the associated events list, but we really
+		 * Make the tuplestore valid until end of subtransaction.  We really
 		 * only need it until AfterTriggerEndQuery().
 		 */
-		oldcxt = MemoryContextSwitchTo(TopTransactionContext);
+		oldcxt = MemoryContextSwitchTo(CurTransactionContext);
 		saveResourceOwner = CurrentResourceOwner;
 		PG_TRY();
 		{
-			CurrentResourceOwner = TopTransactionResourceOwner;
+			CurrentResourceOwner = CurTransactionResourceOwner;
 			ret = tuplestore_begin_heap(false, false, work_mem);
 		}
 		PG_CATCH();
@@ -3634,7 +3636,7 @@ GetTriggerTransitionTuplestore(Tuplestorestate **tss)
 		CurrentResourceOwner = saveResourceOwner;
 		MemoryContextSwitchTo(oldcxt);
 
-		tss[afterTriggers.query_depth] = ret;
+		afterTriggers.query_stack[afterTriggers.query_depth].fdw_tuplestore = ret;
 	}
 
 	return ret;
@@ -3780,7 +3782,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events,
 		if (newshared->ats_tgoid == evtshared->ats_tgoid &&
 			newshared->ats_relid == evtshared->ats_relid &&
 			newshared->ats_event == evtshared->ats_event &&
-			newshared->ats_transition_capture == evtshared->ats_transition_capture &&
+			newshared->ats_table == evtshared->ats_table &&
 			newshared->ats_firing_id == 0)
 			break;
 	}
@@ -3892,8 +3894,7 @@ AfterTriggerExecute(AfterTriggerEvent event,
 					FmgrInfo *finfo, Instrumentation *instr,
 					MemoryContext per_tuple_context,
 					TupleTableSlot *trig_tuple_slot1,
-					TupleTableSlot *trig_tuple_slot2,
-					TransitionCaptureState *transition_capture)
+					TupleTableSlot *trig_tuple_slot2)
 {
 	AfterTriggerShared evtshared = GetTriggerSharedData(event);
 	Oid			tgoid = evtshared->ats_tgoid;
@@ -3934,9 +3935,7 @@ AfterTriggerExecute(AfterTriggerEvent event,
 	{
 		case AFTER_TRIGGER_FDW_FETCH:
 			{
-				Tuplestorestate *fdw_tuplestore =
-				GetTriggerTransitionTuplestore
-				(afterTriggers.fdw_tuplestores);
+				Tuplestorestate *fdw_tuplestore = GetCurrentFDWTuplestore();
 
 				if (!tuplestore_gettupleslot(fdw_tuplestore, true, false,
 											 trig_tuple_slot1))
@@ -4006,36 +4005,25 @@ AfterTriggerExecute(AfterTriggerEvent event,
 	}
 
 	/*
-	 * Set up the tuplestore information.
+	 * Set up the tuplestore information to let the trigger have access to
+	 * transition tables.  When we first make a transition table available to
+	 * a trigger, mark it "closed" so that it cannot change anymore.  If any
+	 * additional events of the same type get queued in the current trigger
+	 * query level, they'll go into new transition tables.
 	 */
 	LocTriggerData.tg_oldtable = LocTriggerData.tg_newtable = NULL;
-	if (transition_capture != NULL)
+	if (evtshared->ats_table)
 	{
 		if (LocTriggerData.tg_trigger->tgoldtable)
-			LocTriggerData.tg_oldtable = transition_capture->tcs_old_tuplestore;
+		{
+			LocTriggerData.tg_oldtable = evtshared->ats_table->old_tuplestore;
+			evtshared->ats_table->closed = true;
+		}
+
 		if (LocTriggerData.tg_trigger->tgnewtable)
 		{
-			/*
-			 * Currently a trigger with transition tables may only be defined
-			 * for a single event type (here AFTER INSERT or AFTER UPDATE, but
-			 * not AFTER INSERT OR ...).
-			 */
-			Assert((TRIGGER_FOR_INSERT(LocTriggerData.tg_trigger->tgtype) != 0) ^
-				   (TRIGGER_FOR_UPDATE(LocTriggerData.tg_trigger->tgtype) != 0));
-
-			/*
-			 * Show either the insert or update new tuple images, depending on
-			 * which event type the trigger was registered for.  A single
-			 * statement may have produced both in the case of INSERT ... ON
-			 * CONFLICT ... DO UPDATE, and in that case the event determines
-			 * which tuplestore the trigger sees as the NEW TABLE.
-			 */
-			if (TRIGGER_FOR_INSERT(LocTriggerData.tg_trigger->tgtype))
-				LocTriggerData.tg_newtable =
-					transition_capture->tcs_insert_tuplestore;
-			else
-				LocTriggerData.tg_newtable =
-					transition_capture->tcs_update_tuplestore;
+			LocTriggerData.tg_newtable = evtshared->ats_table->new_tuplestore;
+			evtshared->ats_table->closed = true;
 		}
 	}
 
@@ -4245,8 +4233,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
 				 * won't try to re-fire it.
 				 */
 				AfterTriggerExecute(event, rel, trigdesc, finfo, instr,
-									per_tuple_context, slot1, slot2,
-									evtshared->ats_transition_capture);
+									per_tuple_context, slot1, slot2);
 
 				/*
 				 * Mark the event as done.
@@ -4296,6 +4283,166 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
 }
 
 
+/*
+ * GetAfterTriggersTableData
+ *
+ * Return the AfterTriggersTableData struct of the current query level that
+ * matches the given relation + operation type, appending a new zeroed one
+ * to the level's "tables" list if none matches.  Structs marked "closed"
+ * never match; we must not add tuples to them or alter their stmt state.
+ *
+ * Note: the struct and its list cell are allocated in the current
+ * (sub)transaction's CurTransactionContext.  This is OK because
+ * we don't need them to live past AfterTriggerEndQuery.
+ */
+static AfterTriggersTableData *
+GetAfterTriggersTableData(Oid relid, CmdType cmdType)
+{
+	AfterTriggersTableData *table;
+	AfterTriggersQueryData *qs;
+	MemoryContext oldcxt;
+	ListCell   *lc;
+
+	/* Caller must already have made query_stack[query_depth] valid. */
+	Assert(afterTriggers.query_depth >= 0 &&
+		   afterTriggers.query_depth < afterTriggers.maxquerydepth);
+	qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+
+	foreach(lc, qs->tables)
+	{
+		table = (AfterTriggersTableData *) lfirst(lc);
+		if (table->relid == relid && table->cmdType == cmdType &&
+			!table->closed)
+			return table;
+	}
+
+	oldcxt = MemoryContextSwitchTo(CurTransactionContext);
+
+	table = (AfterTriggersTableData *) palloc0(sizeof(AfterTriggersTableData));
+	table->relid = relid;
+	table->cmdType = cmdType;
+	qs->tables = lappend(qs->tables, table);
+
+	MemoryContextSwitchTo(oldcxt);
+
+	return table;
+}
+
+
+/*
+ * MakeTransitionCaptureState
+ *
+ * Make a TransitionCaptureState object for the given TriggerDesc, target
+ * relation, and operation type.  The TCS object holds all the state needed
+ * to decide whether to capture tuples in transition tables.
+ *
+ * Return NULL if 'trigdesc' is NULL or has no triggers requesting transition
+ * tables relevant to 'cmdType'.  It's an error to call this outside a query.
+ *
+ * The resulting object can be passed to the ExecAR* functions.  The caller
+ * should set tcs_map or tcs_original_insert_tuple as appropriate when dealing
+ * with child tables.
+ *
+ * Note that we copy the flags from a parent table into this struct (rather
+ * than subsequently using the relation's TriggerDesc directly) so that we can
+ * use it to control collection of transition tuples from child tables.
+ *
+ * Per SQL spec, all operations of the same kind (INSERT/UPDATE/DELETE)
+ * on the same table during one query should share one transition table.
+ * Therefore, the Tuplestores are owned by an AfterTriggersTableData struct
+ * looked up using the table OID + CmdType, and are merely referenced by
+ * the TransitionCaptureState objects we hand out to callers.
+ */
+TransitionCaptureState *
+MakeTransitionCaptureState(TriggerDesc *trigdesc, Oid relid, CmdType cmdType)
+{
+	TransitionCaptureState *state;
+	bool		need_old,
+				need_new;
+	AfterTriggersTableData *table;
+	MemoryContext oldcxt;
+	ResourceOwner saveResourceOwner;
+
+	if (trigdesc == NULL)
+		return NULL;
+
+	/* Detect which transition table(s) this operation type needs. */
+	switch (cmdType)
+	{
+		case CMD_INSERT:
+			need_old = false;
+			need_new = trigdesc->trig_insert_new_table;
+			break;
+		case CMD_UPDATE:
+			need_old = trigdesc->trig_update_old_table;
+			need_new = trigdesc->trig_update_new_table;
+			break;
+		case CMD_DELETE:
+			need_old = trigdesc->trig_delete_old_table;
+			need_new = false;
+			break;
+		default:
+			elog(ERROR, "unexpected CmdType: %d", (int) cmdType);
+			need_old = need_new = false;	/* keep compiler quiet */
+			break;
+	}
+	if (!need_old && !need_new)
+		return NULL;
+
+	/* Must be inside a trigger query level, like AfterTriggerSaveEvent. */
+	if (afterTriggers.query_depth < 0)
+		elog(ERROR, "MakeTransitionCaptureState() called outside of query");
+
+	/* Be sure we have enough space to record events at this query depth. */
+	if (afterTriggers.query_depth >= afterTriggers.maxquerydepth)
+		AfterTriggerEnlargeQueryState();
+
+	/*
+	 * Find or create an AfterTriggersTableData struct to hold the
+	 * tuplestore(s).  If there's a matching struct but it's marked closed,
+	 * ignore it; we need a newer one.
+	 *
+	 * Note: the AfterTriggersTableData list, as well as the tuplestores, are
+	 * allocated in the current (sub)transaction's CurTransactionContext, and
+	 * the tuplestores are managed by the (sub)transaction's resource owner.
+	 * This is sufficient lifespan because we do not allow triggers using
+	 * transition tables to be deferrable; they will be fired during
+	 * AfterTriggerEndQuery, after which it's okay to delete the data.
+	 */
+	table = GetAfterTriggersTableData(relid, cmdType);
+
+	/* Create any needed tuplestore the table struct doesn't have already. */
+	oldcxt = MemoryContextSwitchTo(CurTransactionContext);
+	saveResourceOwner = CurrentResourceOwner;
+	PG_TRY();
+	{
+		CurrentResourceOwner = CurTransactionResourceOwner;
+		if (need_old && table->old_tuplestore == NULL)
+			table->old_tuplestore = tuplestore_begin_heap(false, false, work_mem);
+		if (need_new && table->new_tuplestore == NULL)
+			table->new_tuplestore = tuplestore_begin_heap(false, false, work_mem);
+	}
+	PG_CATCH();
+	{
+		CurrentResourceOwner = saveResourceOwner;
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+	CurrentResourceOwner = saveResourceOwner;
+	MemoryContextSwitchTo(oldcxt);
+
+	/* Build the TCS in caller's context; it just points at the table data */
+	state = (TransitionCaptureState *) palloc0(sizeof(TransitionCaptureState));
+	state->tcs_delete_old_table = trigdesc->trig_delete_old_table;
+	state->tcs_update_old_table = trigdesc->trig_update_old_table;
+	state->tcs_update_new_table = trigdesc->trig_update_new_table;
+	state->tcs_insert_new_table = trigdesc->trig_insert_new_table;
+	state->tcs_private = table;
+
+	return state;
+}
+
+
 /* ----------
  * AfterTriggerBeginXact()
  *
@@ -4319,14 +4466,10 @@ AfterTriggerBeginXact(void)
 	 */
 	Assert(afterTriggers.state == NULL);
 	Assert(afterTriggers.query_stack == NULL);
-	Assert(afterTriggers.fdw_tuplestores == NULL);
 	Assert(afterTriggers.maxquerydepth == 0);
 	Assert(afterTriggers.event_cxt == NULL);
 	Assert(afterTriggers.events.head == NULL);
-	Assert(afterTriggers.state_stack == NULL);
-	Assert(afterTriggers.events_stack == NULL);
-	Assert(afterTriggers.depth_stack == NULL);
-	Assert(afterTriggers.firing_stack == NULL);
+	Assert(afterTriggers.trans_stack == NULL);
 	Assert(afterTriggers.maxtransdepth == 0);
 }
 
@@ -4362,9 +4505,6 @@ AfterTriggerBeginQuery(void)
 void
 AfterTriggerEndQuery(EState *estate)
 {
-	AfterTriggerEventList *events;
-	Tuplestorestate *fdw_tuplestore;
-
 	/* Must be inside a query, too */
 	Assert(afterTriggers.query_depth >= 0);
 
@@ -4393,41 +4533,89 @@ AfterTriggerEndQuery(EState *estate)
 	 * will instead fire any triggers in a dedicated query level.  Foreign key
 	 * enforcement triggers do add to the current query level, thanks to their
 	 * passing fire_triggers = false to SPI_execute_snapshot().  Other
-	 * C-language triggers might do likewise.  Be careful here: firing a
-	 * trigger could result in query_stack being repalloc'd, so we can't save
-	 * its address across afterTriggerInvokeEvents calls.
+	 * C-language triggers might do likewise.
 	 *
 	 * If we find no firable events, we don't have to increment
 	 * firing_counter.
 	 */
 	for (;;)
 	{
-		events = &afterTriggers.query_stack[afterTriggers.query_depth];
-		if (afterTriggerMarkEvents(events, &afterTriggers.events, true))
+		AfterTriggersQueryData *qs;
+
+		/*
+		 * Firing a trigger could result in query_stack being repalloc'd, so
+		 * we must recalculate qs after each afterTriggerInvokeEvents call.
+		 */
+		qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+
+		if (afterTriggerMarkEvents(&qs->events, &afterTriggers.events, true))
 		{
 			CommandId	firing_id = afterTriggers.firing_counter++;
 
 			/* OK to delete the immediate events after processing them */
-			if (afterTriggerInvokeEvents(events, firing_id, estate, true))
+			if (afterTriggerInvokeEvents(&qs->events, firing_id, estate, true))
 				break;			/* all fired */
 		}
 		else
 			break;
 	}
 
-	/* Release query-local storage for events, including tuplestore if any */
-	fdw_tuplestore = afterTriggers.fdw_tuplestores[afterTriggers.query_depth];
-	if (fdw_tuplestore)
-	{
-		tuplestore_end(fdw_tuplestore);
-		afterTriggers.fdw_tuplestores[afterTriggers.query_depth] = NULL;
-	}
-	afterTriggerFreeEventList(&afterTriggers.query_stack[afterTriggers.query_depth]);
+	/* Release query-level-local storage, including tuplestores if any */
+	AfterTriggerFreeQuery(&afterTriggers.query_stack[afterTriggers.query_depth]);
 
 	afterTriggers.query_depth--;
 }
 
 
+/*
+ * AfterTriggerFreeQuery
+ *	Release subsidiary storage for a trigger query level: its event list,
+ *	FDW tuplestore, and per-table transition tuplestores and structs.
+ *	Note: it's important for this to be safe if interrupted by an error
+ *	and then called again for the same query level.
+ */
+static void
+AfterTriggerFreeQuery(AfterTriggersQueryData *qs)
+{
+	Tuplestorestate *ts;
+	List	   *tables;
+	ListCell   *lc;
+
+	/* Drop the trigger events */
+	afterTriggerFreeEventList(&qs->events);
+
+	/* Drop FDW tuplestore if any; clear the pointer before ending it */
+	ts = qs->fdw_tuplestore;
+	qs->fdw_tuplestore = NULL;
+	if (ts)
+		tuplestore_end(ts);
+
+	/* Release per-table tuplestores, likewise clearing each pointer first */
+	tables = qs->tables;
+	foreach(lc, tables)
+	{
+		AfterTriggersTableData *table = (AfterTriggersTableData *) lfirst(lc);
+
+		ts = table->old_tuplestore;
+		table->old_tuplestore = NULL;
+		if (ts)
+			tuplestore_end(ts);
+		ts = table->new_tuplestore;
+		table->new_tuplestore = NULL;
+		if (ts)
+			tuplestore_end(ts);
+	}
+
+	/*
+	 * Now free the AfterTriggersTableData structs and list cells.  Reset list
+	 * pointer first; if list_free_deep somehow gets an error, better to leak
+	 * that storage than have an infinite loop.
+	 */
+	qs->tables = NIL;
+	list_free_deep(tables);
+}
+
+
 /* ----------
  * AfterTriggerFireDeferred()
  *
@@ -4521,10 +4709,7 @@ AfterTriggerEndXact(bool isCommit)
 	 * large, we let the eventual reset of TopTransactionContext free the
 	 * memory instead of doing it here.
 	 */
-	afterTriggers.state_stack = NULL;
-	afterTriggers.events_stack = NULL;
-	afterTriggers.depth_stack = NULL;
-	afterTriggers.firing_stack = NULL;
+	afterTriggers.trans_stack = NULL;
 	afterTriggers.maxtransdepth = 0;
 
 
@@ -4534,7 +4719,6 @@ AfterTriggerEndXact(bool isCommit)
 	 * memory here.
 	 */
 	afterTriggers.query_stack = NULL;
-	afterTriggers.fdw_tuplestores = NULL;
 	afterTriggers.maxquerydepth = 0;
 	afterTriggers.state = NULL;
 
@@ -4553,48 +4737,28 @@ AfterTriggerBeginSubXact(void)
 	int			my_level = GetCurrentTransactionNestLevel();
 
 	/*
-	 * Allocate more space in the stacks if needed.  (Note: because the
+	 * Allocate more space in the trans_stack if needed.  (Note: because the
 	 * minimum nest level of a subtransaction is 2, we waste the first couple
-	 * entries of each array; not worth the notational effort to avoid it.)
+	 * entries of the array; not worth the notational effort to avoid it.)
 	 */
 	while (my_level >= afterTriggers.maxtransdepth)
 	{
 		if (afterTriggers.maxtransdepth == 0)
 		{
-			MemoryContext old_cxt;
-
-			old_cxt = MemoryContextSwitchTo(TopTransactionContext);
-
-#define DEFTRIG_INITALLOC 8
-			afterTriggers.state_stack = (SetConstraintState *)
-				palloc(DEFTRIG_INITALLOC * sizeof(SetConstraintState));
-			afterTriggers.events_stack = (AfterTriggerEventList *)
-				palloc(DEFTRIG_INITALLOC * sizeof(AfterTriggerEventList));
-			afterTriggers.depth_stack = (int *)
-				palloc(DEFTRIG_INITALLOC * sizeof(int));
-			afterTriggers.firing_stack = (CommandId *)
-				palloc(DEFTRIG_INITALLOC * sizeof(CommandId));
-			afterTriggers.maxtransdepth = DEFTRIG_INITALLOC;
-
-			MemoryContextSwitchTo(old_cxt);
+			/* Arbitrarily initialize for max of 8 subtransaction levels */
+			afterTriggers.trans_stack = (AfterTriggersTransData *)
+				MemoryContextAlloc(TopTransactionContext,
+								   8 * sizeof(AfterTriggersTransData));
+			afterTriggers.maxtransdepth = 8;
 		}
 		else
 		{
-			/* repalloc will keep the stacks in the same context */
+			/* repalloc will keep the stack in the same context */
 			int			new_alloc = afterTriggers.maxtransdepth * 2;
 
-			afterTriggers.state_stack = (SetConstraintState *)
-				repalloc(afterTriggers.state_stack,
-						 new_alloc * sizeof(SetConstraintState));
-			afterTriggers.events_stack = (AfterTriggerEventList *)
-				repalloc(afterTriggers.events_stack,
-						 new_alloc * sizeof(AfterTriggerEventList));
-			afterTriggers.depth_stack = (int *)
-				repalloc(afterTriggers.depth_stack,
-						 new_alloc * sizeof(int));
-			afterTriggers.firing_stack = (CommandId *)
-				repalloc(afterTriggers.firing_stack,
-						 new_alloc * sizeof(CommandId));
+			afterTriggers.trans_stack = (AfterTriggersTransData *)
+				repalloc(afterTriggers.trans_stack,
+						 new_alloc * sizeof(AfterTriggersTransData));
 			afterTriggers.maxtransdepth = new_alloc;
 		}
 	}
@@ -4604,10 +4768,10 @@ AfterTriggerBeginSubXact(void)
 	 * is not saved until/unless changed.  Likewise, we don't make a
 	 * per-subtransaction event context until needed.
 	 */
-	afterTriggers.state_stack[my_level] = NULL;
-	afterTriggers.events_stack[my_level] = afterTriggers.events;
-	afterTriggers.depth_stack[my_level] = afterTriggers.query_depth;
-	afterTriggers.firing_stack[my_level] = afterTriggers.firing_counter;
+	afterTriggers.trans_stack[my_level].state = NULL;
+	afterTriggers.trans_stack[my_level].events = afterTriggers.events;
+	afterTriggers.trans_stack[my_level].query_depth = afterTriggers.query_depth;
+	afterTriggers.trans_stack[my_level].firing_counter = afterTriggers.firing_counter;
 }
 
 /*
@@ -4631,70 +4795,58 @@ AfterTriggerEndSubXact(bool isCommit)
 	{
 		Assert(my_level < afterTriggers.maxtransdepth);
 		/* If we saved a prior state, we don't need it anymore */
-		state = afterTriggers.state_stack[my_level];
+		state = afterTriggers.trans_stack[my_level].state;
 		if (state != NULL)
 			pfree(state);
 		/* this avoids double pfree if error later: */
-		afterTriggers.state_stack[my_level] = NULL;
+		afterTriggers.trans_stack[my_level].state = NULL;
 		Assert(afterTriggers.query_depth ==
-			   afterTriggers.depth_stack[my_level]);
+			   afterTriggers.trans_stack[my_level].query_depth);
 	}
 	else
 	{
 		/*
 		 * Aborting.  It is possible subxact start failed before calling
 		 * AfterTriggerBeginSubXact, in which case we mustn't risk touching
-		 * stack levels that aren't there.
+		 * trans_stack levels that aren't there.
 		 */
 		if (my_level >= afterTriggers.maxtransdepth)
 			return;
 
 		/*
-		 * Release any event lists from queries being aborted, and restore
+		 * Release query-level storage for queries being aborted, and restore
 		 * query_depth to its pre-subxact value.  This assumes that a
 		 * subtransaction will not add events to query levels started in a
 		 * earlier transaction state.
 		 */
-		while (afterTriggers.query_depth > afterTriggers.depth_stack[my_level])
+		while (afterTriggers.query_depth > afterTriggers.trans_stack[my_level].query_depth)
 		{
 			if (afterTriggers.query_depth < afterTriggers.maxquerydepth)
-			{
-				Tuplestorestate *ts;
-
-				ts = afterTriggers.fdw_tuplestores[afterTriggers.query_depth];
-				if (ts)
-				{
-					tuplestore_end(ts);
-					afterTriggers.fdw_tuplestores[afterTriggers.query_depth] = NULL;
-				}
-
-				afterTriggerFreeEventList(&afterTriggers.query_stack[afterTriggers.query_depth]);
-			}
-
+				AfterTriggerFreeQuery(&afterTriggers.query_stack[afterTriggers.query_depth]);
 			afterTriggers.query_depth--;
 		}
 		Assert(afterTriggers.query_depth ==
-			   afterTriggers.depth_stack[my_level]);
+			   afterTriggers.trans_stack[my_level].query_depth);
 
 		/*
 		 * Restore the global deferred-event list to its former length,
 		 * discarding any events queued by the subxact.
 		 */
 		afterTriggerRestoreEventList(&afterTriggers.events,
-									 &afterTriggers.events_stack[my_level]);
+									 &afterTriggers.trans_stack[my_level].events);
 
 		/*
 		 * Restore the trigger state.  If the saved state is NULL, then this
 		 * subxact didn't save it, so it doesn't need restoring.
 		 */
-		state = afterTriggers.state_stack[my_level];
+		state = afterTriggers.trans_stack[my_level].state;
 		if (state != NULL)
 		{
 			pfree(afterTriggers.state);
 			afterTriggers.state = state;
 		}
 		/* this avoids double pfree if error later: */
-		afterTriggers.state_stack[my_level] = NULL;
+		afterTriggers.trans_stack[my_level].state = NULL;
 
 		/*
 		 * Scan for any remaining deferred events that were marked DONE or IN
@@ -4704,7 +4856,7 @@ AfterTriggerEndSubXact(bool isCommit)
 		 * (This essentially assumes that the current subxact includes all
 		 * subxacts started after it.)
 		 */
-		subxact_firing_id = afterTriggers.firing_stack[my_level];
+		subxact_firing_id = afterTriggers.trans_stack[my_level].firing_counter;
 		for_each_event_chunk(event, chunk, afterTriggers.events)
 		{
 			AfterTriggerShared evtshared = GetTriggerSharedData(event);
@@ -4740,12 +4892,9 @@ AfterTriggerEnlargeQueryState(void)
 	{
 		int			new_alloc = Max(afterTriggers.query_depth + 1, 8);
 
-		afterTriggers.query_stack = (AfterTriggerEventList *)
+		afterTriggers.query_stack = (AfterTriggersQueryData *)
 			MemoryContextAlloc(TopTransactionContext,
-							   new_alloc * sizeof(AfterTriggerEventList));
-		afterTriggers.fdw_tuplestores = (Tuplestorestate **)
-			MemoryContextAllocZero(TopTransactionContext,
-								   new_alloc * sizeof(Tuplestorestate *));
+							   new_alloc * sizeof(AfterTriggersQueryData));
 		afterTriggers.maxquerydepth = new_alloc;
 	}
 	else
@@ -4755,27 +4904,22 @@ AfterTriggerEnlargeQueryState(void)
 		int			new_alloc = Max(afterTriggers.query_depth + 1,
 									old_alloc * 2);
 
-		afterTriggers.query_stack = (AfterTriggerEventList *)
+		afterTriggers.query_stack = (AfterTriggersQueryData *)
 			repalloc(afterTriggers.query_stack,
-					 new_alloc * sizeof(AfterTriggerEventList));
-		afterTriggers.fdw_tuplestores = (Tuplestorestate **)
-			repalloc(afterTriggers.fdw_tuplestores,
-					 new_alloc * sizeof(Tuplestorestate *));
-		/* Clear newly-allocated slots for subsequent lazy initialization. */
-		memset(afterTriggers.fdw_tuplestores + old_alloc,
-			   0, (new_alloc - old_alloc) * sizeof(Tuplestorestate *));
+					 new_alloc * sizeof(AfterTriggersQueryData));
 		afterTriggers.maxquerydepth = new_alloc;
 	}
 
-	/* Initialize new query lists to empty */
+	/* Initialize new array entries to empty */
 	while (init_depth < afterTriggers.maxquerydepth)
 	{
-		AfterTriggerEventList *events;
+		AfterTriggersQueryData *qs = &afterTriggers.query_stack[init_depth];
 
-		events = &afterTriggers.query_stack[init_depth];
-		events->head = NULL;
-		events->tail = NULL;
-		events->tailfree = NULL;
+		qs->events.head = NULL;
+		qs->events.tail = NULL;
+		qs->events.tailfree = NULL;
+		qs->fdw_tuplestore = NULL;
+		qs->tables = NIL;
 
 		++init_depth;
 	}
@@ -4873,9 +5017,9 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt)
 	 * save it so it can be restored if the subtransaction aborts.
 	 */
 	if (my_level > 1 &&
-		afterTriggers.state_stack[my_level] == NULL)
+		afterTriggers.trans_stack[my_level].state == NULL)
 	{
-		afterTriggers.state_stack[my_level] =
+		afterTriggers.trans_stack[my_level].state =
 			SetConstraintStateCopy(afterTriggers.state);
 	}
 
@@ -5184,7 +5328,7 @@ AfterTriggerPendingOnRel(Oid relid)
 	 */
 	for (depth = 0; depth <= afterTriggers.query_depth && depth < afterTriggers.maxquerydepth; depth++)
 	{
-		for_each_event_chunk(event, chunk, afterTriggers.query_stack[depth])
+		for_each_event_chunk(event, chunk, afterTriggers.query_stack[depth].events)
 		{
 			AfterTriggerShared evtshared = GetTriggerSharedData(event);
 
@@ -5229,7 +5373,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 	TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
 	AfterTriggerEventData new_event;
 	AfterTriggerSharedData new_shared;
-	char		relkind = relinfo->ri_RelationDesc->rd_rel->relkind;
+	char		relkind = rel->rd_rel->relkind;
 	int			tgtype_event;
 	int			tgtype_level;
 	int			i;
@@ -5266,7 +5410,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 			Tuplestorestate *old_tuplestore;
 
 			Assert(oldtup != NULL);
-			old_tuplestore = transition_capture->tcs_old_tuplestore;
+			old_tuplestore = transition_capture->tcs_private->old_tuplestore;
 
 			if (map != NULL)
 			{
@@ -5284,10 +5428,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 			Tuplestorestate *new_tuplestore;
 
 			Assert(newtup != NULL);
-			if (event == TRIGGER_EVENT_INSERT)
-				new_tuplestore = transition_capture->tcs_insert_tuplestore;
-			else
-				new_tuplestore = transition_capture->tcs_update_tuplestore;
+			new_tuplestore = transition_capture->tcs_private->new_tuplestore;
 
 			if (original_insert_tuple != NULL)
 				tuplestore_puttuple(new_tuplestore, original_insert_tuple);
@@ -5316,6 +5457,11 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 	 * The event code will be used both as a bitmask and an array offset, so
 	 * validation is important to make sure we don't walk off the edge of our
 	 * arrays.
+	 *
+	 * Also, if we're considering statement-level triggers, check whether we
+	 * already queued a set of them for this event, and cancel the prior set
+	 * if so.  This preserves the behavior that statement-level triggers fire
+	 * just once per statement and fire after row-level triggers.
 	 */
 	switch (event)
 	{
@@ -5334,6 +5480,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 				Assert(newtup == NULL);
 				ItemPointerSetInvalid(&(new_event.ate_ctid1));
 				ItemPointerSetInvalid(&(new_event.ate_ctid2));
+				cancel_prior_stmt_triggers(RelationGetRelid(rel),
+										   CMD_INSERT, event);
 			}
 			break;
 		case TRIGGER_EVENT_DELETE:
@@ -5351,6 +5499,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 				Assert(newtup == NULL);
 				ItemPointerSetInvalid(&(new_event.ate_ctid1));
 				ItemPointerSetInvalid(&(new_event.ate_ctid2));
+				cancel_prior_stmt_triggers(RelationGetRelid(rel),
+										   CMD_DELETE, event);
 			}
 			break;
 		case TRIGGER_EVENT_UPDATE:
@@ -5368,6 +5518,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 				Assert(newtup == NULL);
 				ItemPointerSetInvalid(&(new_event.ate_ctid1));
 				ItemPointerSetInvalid(&(new_event.ate_ctid2));
+				cancel_prior_stmt_triggers(RelationGetRelid(rel),
+										   CMD_UPDATE, event);
 			}
 			break;
 		case TRIGGER_EVENT_TRUNCATE:
@@ -5407,9 +5559,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 		{
 			if (fdw_tuplestore == NULL)
 			{
-				fdw_tuplestore =
-					GetTriggerTransitionTuplestore
-					(afterTriggers.fdw_tuplestores);
+				fdw_tuplestore = GetCurrentFDWTuplestore();
 				new_event.ate_flags = AFTER_TRIGGER_FDW_FETCH;
 			}
 			else
@@ -5465,6 +5615,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 
 		/*
 		 * Fill in event structure and add it to the current query's queue.
+		 * Note we set ats_table to NULL whenever this trigger doesn't use
+		 * transition tables, to improve sharability of the shared event data.
 		 */
 		new_shared.ats_event =
 			(event & TRIGGER_EVENT_OPMASK) |
@@ -5474,11 +5626,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 		new_shared.ats_tgoid = trigger->tgoid;
 		new_shared.ats_relid = RelationGetRelid(rel);
 		new_shared.ats_firing_id = 0;
-		/* deferrable triggers cannot access transition data */
-		new_shared.ats_transition_capture =
-			trigger->tgdeferrable ? NULL : transition_capture;
+		if ((trigger->tgoldtable || trigger->tgnewtable) &&
+			transition_capture != NULL)
+			new_shared.ats_table = transition_capture->tcs_private;
+		else
+			new_shared.ats_table = NULL;
 
-		afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth],
+		afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth].events,
 							 &new_event, &new_shared);
 	}
 
@@ -5496,6 +5650,100 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
 	}
 }
 
+/*
+ * If we previously queued a set of AFTER STATEMENT triggers for the given
+ * relation + operation, and they've not been fired yet, cancel them.  The
+ * caller will queue a fresh set that's after any row-level triggers that may
+ * have been queued by the current sub-statement, preserving (as much as
+ * possible) the property that AFTER ROW triggers fire before AFTER STATEMENT
+ * triggers, and that the latter only fire once.  This deals with the
+ * situation where several FK enforcement triggers sequentially queue triggers
+ * for the same table into the same trigger query level.  We can't fully
+ * prevent odd behavior though: if there are AFTER ROW triggers taking
+ * transition tables, we don't want to change the transition tables once the
+ * first such trigger has seen them.  In such a case, any additional events
+ * will result in creating new transition tables and allowing new firings of
+ * statement triggers.
+ *
+ * This also saves the current event list location so that a later invocation
+ * of this function can cheaply find the triggers we're about to queue and
+ * cancel them.
+ */
+static void
+cancel_prior_stmt_triggers(Oid relid, CmdType cmdType, int tgevent)
+{
+	AfterTriggersTableData *table;
+	AfterTriggersQueryData *qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+
+	/*
+	 * We keep this state in the AfterTriggersTableData that also holds
+	 * transition tables for the relation + operation.  In this way, if we are
+	 * forced to make a new set of transition tables because more tuples get
+	 * entered after we've already fired triggers, we will allow a new set of
+	 * statement triggers to get queued without canceling the old ones.
+	 */
+	table = GetAfterTriggersTableData(relid, cmdType);
+
+	if (table->stmt_trig_done)
+	{
+		/*
+		 * We want to start scanning from the tail location that existed just
+		 * before we inserted any statement triggers.  But the events list
+		 * might've been entirely empty then, in which case scan from the
+		 * current head.
+		 */
+		AfterTriggerEvent event;
+		AfterTriggerEventChunk *chunk;
+
+		if (table->stmt_trig_events.tail)
+		{
+			chunk = table->stmt_trig_events.tail;
+			event = (AfterTriggerEvent) table->stmt_trig_events.tailfree;
+		}
+		else
+		{
+			chunk = qs->events.head;
+			event = NULL;
+		}
+
+		for_each_chunk_from(chunk)
+		{
+			if (event == NULL)
+				event = (AfterTriggerEvent) CHUNK_DATA_START(chunk);
+			for_each_event_from(event, chunk)
+			{
+				AfterTriggerShared evtshared = GetTriggerSharedData(event);
+
+				/*
+				 * Exit loop at the first event that isn't an AFTER STATEMENT
+				 * trigger for the target relation and operation.
+				 */
+				if (evtshared->ats_relid != relid)
+					goto done;
+				if ((evtshared->ats_event & TRIGGER_EVENT_OPMASK) != tgevent)
+					goto done;
+				if (!TRIGGER_FIRED_FOR_STATEMENT(evtshared->ats_event))
+					goto done;
+				if (!TRIGGER_FIRED_AFTER(evtshared->ats_event))
+					goto done;
+				/* OK, mark it DONE */
+				event->ate_flags &= ~AFTER_TRIGGER_IN_PROGRESS;
+				event->ate_flags |= AFTER_TRIGGER_DONE;
+			}
+			/* signal we must reinitialize event ptr for next chunk */
+			event = NULL;
+		}
+	}
+done:
+
+	/* In any case, save current insertion point for next time */
+	table->stmt_trig_done = true;
+	table->stmt_trig_events = qs->events;
+}
+
+/*
+ * SQL function pg_trigger_depth()
+ */
 Datum
 pg_trigger_depth(PG_FUNCTION_ARGS)
 {
diff --git a/src/backend/executor/README b/src/backend/executor/README
index a0045067fb8..b3e74aa1a54 100644
--- a/src/backend/executor/README
+++ b/src/backend/executor/README
@@ -241,11 +241,11 @@ This is a sketch of control flow for full query processing:
 		CreateExecutorState
 			creates per-query context
 		switch to per-query context to run ExecInitNode
+		AfterTriggerBeginQuery
 		ExecInitNode --- recursively scans plan tree
 			CreateExprContext
 				creates per-tuple context
 			ExecInitExpr
-		AfterTriggerBeginQuery
 
 	ExecutorRun
 		ExecProcNode --- recursively called in per-query context
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 9dcc358ec27..396b7a1e83f 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -251,11 +251,6 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
 	estate->es_top_eflags = eflags;
 	estate->es_instrument = queryDesc->instrument_options;
 
-	/*
-	 * Initialize the plan state tree
-	 */
-	InitPlan(queryDesc, eflags);
-
 	/*
 	 * Set up an AFTER-trigger statement context, unless told not to, or
 	 * unless it's EXPLAIN-only mode (when ExecutorFinish won't be called).
@@ -263,6 +258,11 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
 	if (!(eflags & (EXEC_FLAG_SKIP_TRIGGERS | EXEC_FLAG_EXPLAIN_ONLY)))
 		AfterTriggerBeginQuery();
 
+	/*
+	 * Initialize the plan state tree
+	 */
+	InitPlan(queryDesc, eflags);
+
 	MemoryContextSwitchTo(oldcontext);
 }
 
@@ -1174,6 +1174,7 @@ CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation)
 			switch (operation)
 			{
 				case CMD_INSERT:
+
 					/*
 					 * If foreign partition to do tuple-routing for, skip the
 					 * check; it's disallowed elsewhere.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index d78e868154e..7b5214c9996 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -342,6 +342,9 @@ ExecInsert(ModifyTableState *mtstate,
 				mtstate->mt_transition_capture->tcs_map = NULL;
 			}
 		}
+		if (mtstate->mt_oc_transition_capture != NULL)
+			mtstate->mt_oc_transition_capture->tcs_map =
+				mtstate->mt_transition_tupconv_maps[leaf_part_index];
 
 		/*
 		 * We might need to convert from the parent rowtype to the partition
@@ -1157,6 +1160,8 @@ lreplace:;
 	/* AFTER ROW UPDATE Triggers */
 	ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, tuple,
 						 recheckIndexes,
+						 mtstate->operation == CMD_INSERT ?
+						 mtstate->mt_oc_transition_capture :
 						 mtstate->mt_transition_capture);
 
 	list_free(recheckIndexes);
@@ -1443,7 +1448,7 @@ fireASTriggers(ModifyTableState *node)
 			if (node->mt_onconflict == ONCONFLICT_UPDATE)
 				ExecASUpdateTriggers(node->ps.state,
 									 resultRelInfo,
-									 node->mt_transition_capture);
+									 node->mt_oc_transition_capture);
 			ExecASInsertTriggers(node->ps.state, resultRelInfo,
 								 node->mt_transition_capture);
 			break;
@@ -1473,14 +1478,24 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
 
 	/* Check for transition tables on the directly targeted relation. */
 	mtstate->mt_transition_capture =
-		MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc);
+		MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc,
+								   RelationGetRelid(targetRelInfo->ri_RelationDesc),
+								   mtstate->operation);
+	if (mtstate->operation == CMD_INSERT &&
+		mtstate->mt_onconflict == ONCONFLICT_UPDATE)
+		mtstate->mt_oc_transition_capture =
+			MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc,
+									   RelationGetRelid(targetRelInfo->ri_RelationDesc),
+									   CMD_UPDATE);
 
 	/*
 	 * If we found that we need to collect transition tuples then we may also
 	 * need tuple conversion maps for any children that have TupleDescs that
-	 * aren't compatible with the tuplestores.
+	 * aren't compatible with the tuplestores.  (We can share these maps
+	 * between the regular and ON CONFLICT cases.)
 	 */
-	if (mtstate->mt_transition_capture != NULL)
+	if (mtstate->mt_transition_capture != NULL ||
+		mtstate->mt_oc_transition_capture != NULL)
 	{
 		ResultRelInfo *resultRelInfos;
 		int			numResultRelInfos;
@@ -1521,10 +1536,12 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
 		/*
 		 * Install the conversion map for the first plan for UPDATE and DELETE
 		 * operations.  It will be advanced each time we switch to the next
-		 * plan.  (INSERT operations set it every time.)
+		 * plan.  (INSERT operations set it every time, so we need not update
+		 * mtstate->mt_oc_transition_capture here.)
 		 */
-		mtstate->mt_transition_capture->tcs_map =
-			mtstate->mt_transition_tupconv_maps[0];
+		if (mtstate->mt_transition_capture)
+			mtstate->mt_transition_capture->tcs_map =
+				mtstate->mt_transition_tupconv_maps[0];
 	}
 }
 
@@ -1628,13 +1645,19 @@ ExecModifyTable(PlanState *pstate)
 				estate->es_result_relation_info = resultRelInfo;
 				EvalPlanQualSetPlan(&node->mt_epqstate, subplanstate->plan,
 									node->mt_arowmarks[node->mt_whichplan]);
+				/* Prepare to convert transition tuples from this child. */
 				if (node->mt_transition_capture != NULL)
 				{
-					/* Prepare to convert transition tuples from this child. */
 					Assert(node->mt_transition_tupconv_maps != NULL);
 					node->mt_transition_capture->tcs_map =
 						node->mt_transition_tupconv_maps[node->mt_whichplan];
 				}
+				if (node->mt_oc_transition_capture != NULL)
+				{
+					Assert(node->mt_transition_tupconv_maps != NULL);
+					node->mt_oc_transition_capture->tcs_map =
+						node->mt_transition_tupconv_maps[node->mt_whichplan];
+				}
 				continue;
 			}
 			else
@@ -1933,8 +1956,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		mtstate->mt_partition_tuple_slot = partition_tuple_slot;
 	}
 
-	/* Build state for collecting transition tuples */
-	ExecSetupTransitionCaptureState(mtstate, estate);
+	/*
+	 * Build state for collecting transition tuples.  This requires having a
+	 * valid trigger query context, so skip it in explain-only mode.
+	 */
+	if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+		ExecSetupTransitionCaptureState(mtstate, estate);
 
 	/*
 	 * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2317,16 +2344,6 @@ ExecEndModifyTable(ModifyTableState *node)
 {
 	int			i;
 
-	/*
-	 * Free transition tables, unless this query is being run in
-	 * EXEC_FLAG_SKIP_TRIGGERS mode, which means that it may have queued AFTER
-	 * triggers that won't be run till later.  In that case we'll just leak
-	 * the transition tables till end of (sub)transaction.
-	 */
-	if (node->mt_transition_capture != NULL &&
-		!(node->ps.state->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
-		DestroyTransitionCaptureState(node->mt_transition_capture);
-
 	/*
 	 * Allow any FDWs to shut down
 	 */
diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h
index aeb363f13e8..adbcfa1297a 100644
--- a/src/include/commands/trigger.h
+++ b/src/include/commands/trigger.h
@@ -43,13 +43,21 @@ typedef struct TriggerData
 
 /*
  * The state for capturing old and new tuples into transition tables for a
- * single ModifyTable node.
+ * single ModifyTable node (or other operation source, e.g. copy.c).
+ *
+ * This is per-caller to avoid conflicts in setting tcs_map or
+ * tcs_original_insert_tuple.  Note, however, that the pointed-to
+ * private data may be shared across multiple callers.
  */
+struct AfterTriggersTableData;	/* private in trigger.c */
+
 typedef struct TransitionCaptureState
 {
 	/*
 	 * Is there at least one trigger specifying each transition relation on
 	 * the relation explicitly named in the DML statement or COPY command?
+	 * Note: in current usage, these flags could be part of the private state,
+	 * but it seems possibly useful to let callers see them.
 	 */
 	bool		tcs_delete_old_table;
 	bool		tcs_update_old_table;
@@ -60,7 +68,7 @@ typedef struct TransitionCaptureState
 	 * For UPDATE and DELETE, AfterTriggerSaveEvent may need to convert the
 	 * new and old tuples from a child table's format to the format of the
 	 * relation named in a query so that it is compatible with the transition
-	 * tuplestores.
+	 * tuplestores.  The caller must store the conversion map here if so.
 	 */
 	TupleConversionMap *tcs_map;
 
@@ -74,17 +82,9 @@ typedef struct TransitionCaptureState
 	HeapTuple	tcs_original_insert_tuple;
 
 	/*
-	 * The tuplestores backing the transition tables.  We use separate
-	 * tuplestores for INSERT and UPDATE, because INSERT ... ON CONFLICT ...
-	 * DO UPDATE causes INSERT and UPDATE triggers to fire and needs a way to
-	 * keep track of the new tuple images resulting from the two cases
-	 * separately.  We only need a single old image tuplestore, because there
-	 * is no statement that can both update and delete at the same time.
+	 * Private data including the tuplestore(s) into which to insert tuples.
 	 */
-	Tuplestorestate *tcs_old_tuplestore;	/* for DELETE and UPDATE old
-											 * images */
-	Tuplestorestate *tcs_insert_tuplestore; /* for INSERT new images */
-	Tuplestorestate *tcs_update_tuplestore; /* for UPDATE new images */
+	struct AfterTriggersTableData *tcs_private;
 } TransitionCaptureState;
 
 /*
@@ -174,8 +174,9 @@ extern void RelationBuildTriggers(Relation relation);
 extern TriggerDesc *CopyTriggerDesc(TriggerDesc *trigdesc);
 
 extern const char *FindTriggerIncompatibleWithInheritance(TriggerDesc *trigdesc);
-extern TransitionCaptureState *MakeTransitionCaptureState(TriggerDesc *trigdesc);
-extern void DestroyTransitionCaptureState(TransitionCaptureState *tcs);
+
+extern TransitionCaptureState *MakeTransitionCaptureState(TriggerDesc *trigdesc,
+						   Oid relid, CmdType cmdType);
 
 extern void FreeTriggerDesc(TriggerDesc *trigdesc);
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index a46a56ebc10..0f5d47ba8e4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -983,7 +983,9 @@ typedef struct ModifyTableState
 	/* Per partition tuple conversion map */
 	TupleTableSlot *mt_partition_tuple_slot;
 	struct TransitionCaptureState *mt_transition_capture;
-	/* controls transition table population */
+	/* controls transition table population for specified operation */
+	struct TransitionCaptureState *mt_oc_transition_capture;
+	/* controls transition table population for INSERT...ON CONFLICT UPDATE */
 	TupleConversionMap **mt_transition_tupconv_maps;
 	/* Per plan/partition tuple conversion */
 } ModifyTableState;
diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out
index 620fac1e2c5..3ab6be3421c 100644
--- a/src/test/regress/expected/triggers.out
+++ b/src/test/regress/expected/triggers.out
@@ -2217,6 +2217,23 @@ with wcte as (insert into table1 values (42))
   insert into table2 values ('hello world');
 NOTICE:  trigger = table2_trig, new table = ("hello world")
 NOTICE:  trigger = table1_trig, new table = (42)
+with wcte as (insert into table1 values (43))
+  insert into table1 values (44);
+NOTICE:  trigger = table1_trig, new table = (43), (44)
+select * from table1;
+ a  
+----
+ 42
+ 44
+ 43
+(3 rows)
+
+select * from table2;
+      a      
+-------------
+ hello world
+(1 row)
+
 drop table table1;
 drop table table2;
 --
@@ -2256,6 +2273,14 @@ create trigger my_table_multievent_trig
   after insert or update on my_table referencing new table as new_table
   for each statement execute procedure dump_insert();
 ERROR:  transition tables cannot be specified for triggers with more than one event
+--
+-- Verify that you can't create a trigger with transition tables with
+-- a column list.
+--
+create trigger my_table_col_update_trig
+  after update of b on my_table referencing new table as new_table
+  for each statement execute procedure dump_insert();
+ERROR:  transition tables cannot be specified for triggers with column lists
 drop table my_table;
 --
 -- Test firing of triggers with transition tables by foreign key cascades
@@ -2299,8 +2324,7 @@ select * from trig_table;
 (6 rows)
 
 delete from refd_table where length(b) = 3;
-NOTICE:  trigger = trig_table_delete_trig, old table = (2,"two a"), (2,"two b")
-NOTICE:  trigger = trig_table_delete_trig, old table = (11,"one a"), (11,"one b")
+NOTICE:  trigger = trig_table_delete_trig, old table = (2,"two a"), (2,"two b"), (11,"one a"), (11,"one b")
 select * from trig_table;
  a |    b    
 ---+---------
@@ -2309,6 +2333,30 @@ select * from trig_table;
 (2 rows)
 
 drop table refd_table, trig_table;
+--
+-- self-referential FKs are even more fun
+--
+create table self_ref (a int primary key,
+                       b int references self_ref(a) on delete cascade);
+create trigger self_ref_r_trig
+  after delete on self_ref referencing old table as old_table
+  for each row execute procedure dump_delete();
+create trigger self_ref_s_trig
+  after delete on self_ref referencing old table as old_table
+  for each statement execute procedure dump_delete();
+insert into self_ref values (1, null), (2, 1), (3, 2);
+delete from self_ref where a = 1;
+NOTICE:  trigger = self_ref_r_trig, old table = (1,), (2,1)
+NOTICE:  trigger = self_ref_r_trig, old table = (1,), (2,1)
+NOTICE:  trigger = self_ref_s_trig, old table = (1,), (2,1)
+NOTICE:  trigger = self_ref_r_trig, old table = (3,2)
+NOTICE:  trigger = self_ref_s_trig, old table = (3,2)
+-- without AR trigger, cascaded deletes all end up in one transition table
+drop trigger self_ref_r_trig on self_ref;
+insert into self_ref values (1, null), (2, 1), (3, 2), (4, 3);
+delete from self_ref where a = 1;
+NOTICE:  trigger = self_ref_s_trig, old table = (1,), (2,1), (3,2), (4,3)
+drop table self_ref;
 -- cleanup
 drop function dump_insert();
 drop function dump_update();
diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql
index c6deb56c507..30bb7d17b08 100644
--- a/src/test/regress/sql/triggers.sql
+++ b/src/test/regress/sql/triggers.sql
@@ -1729,6 +1729,12 @@ create trigger table2_trig
 with wcte as (insert into table1 values (42))
   insert into table2 values ('hello world');
 
+with wcte as (insert into table1 values (43))
+  insert into table1 values (44);
+
+select * from table1;
+select * from table2;
+
 drop table table1;
 drop table table2;
 
@@ -1769,6 +1775,15 @@ create trigger my_table_multievent_trig
   after insert or update on my_table referencing new table as new_table
   for each statement execute procedure dump_insert();
 
+--
+-- Verify that you can't create a trigger with transition tables with
+-- a column list.
+--
+
+create trigger my_table_col_update_trig
+  after update of b on my_table referencing new table as new_table
+  for each statement execute procedure dump_insert();
+
 drop table my_table;
 
 --
@@ -1812,6 +1827,33 @@ select * from trig_table;
 
 drop table refd_table, trig_table;
 
+--
+-- self-referential FKs are even more fun
+--
+
+create table self_ref (a int primary key,
+                       b int references self_ref(a) on delete cascade);
+
+create trigger self_ref_r_trig
+  after delete on self_ref referencing old table as old_table
+  for each row execute procedure dump_delete();
+create trigger self_ref_s_trig
+  after delete on self_ref referencing old table as old_table
+  for each statement execute procedure dump_delete();
+
+insert into self_ref values (1, null), (2, 1), (3, 2);
+
+delete from self_ref where a = 1;
+
+-- without AR trigger, cascaded deletes all end up in one transition table
+drop trigger self_ref_r_trig on self_ref;
+
+insert into self_ref values (1, null), (2, 1), (3, 2), (4, 3);
+
+delete from self_ref where a = 1;
+
+drop table self_ref;
+
 -- cleanup
 drop function dump_insert();
 drop function dump_update();