From 58b61bd3a2e247d4d728940f4f9ded666d29c023 Mon Sep 17 00:00:00 2001
From: Mel Kiyama <mkiyama@users.noreply.github.com>
Date: Fri, 15 Sep 2017 13:53:31 -0700
Subject: [PATCH] docs: COPY command - add PROGRAM clause (#3297)

* docs: COPY command add PROGRAM clause

* docs: copy - edits from review comments.
---
 .../dita/ref_guide/config_params/guc-list.xml | 22 +++++--
 gpdb-doc/dita/ref_guide/sql_commands/COPY.xml | 64 ++++++++++++++-----
 .../utility_guide/client_utilities/psql.xml   | 12 ++--
 3 files changed, 71 insertions(+), 27 deletions(-)
diff --git a/gpdb-doc/dita/ref_guide/config_params/guc-list.xml b/gpdb-doc/dita/ref_guide/config_params/guc-list.xml
index b838fe17b0..f8aa1433b5 100644
--- a/gpdb-doc/dita/ref_guide/config_params/guc-list.xml
+++ b/gpdb-doc/dita/ref_guide/config_params/guc-list.xml
@@ -1166,8 +1166,7 @@
           <tbody>
             <row>
               <entry colname="col1">DEBUG5
-                  <p>DEBUG4</p><p>DEBUG3</p><p>DEBUG2</p><p>DEBUG1</p><p>LOG
-                  NOTICE</p><p>WARNING</p><p>ERROR</p><p>FATAL</p><p>PANIC</p></entry>
+                  <p>DEBUG4</p><p>DEBUG3</p><p>DEBUG2</p><p>DEBUG1</p><p>LOG</p><p>NOTICE</p><p>WARNING</p><p>ERROR</p><p>FATAL</p><p>PANIC</p></entry>
               <entry colname="col2">NOTICE</entry>
               <entry colname="col3">master<p>session</p><p>reload</p></entry>
             </row>
@@ -3616,7 +3615,8 @@
         </ul></p>
       <p>If the value is <codeph>false</codeph>, the distribution policy is not checked. The data
         added to the table might violate the table distribution policy for the segment instance.
-        Manual redistribution of table data might be required. </p>
+        Manual redistribution of table data might be required. See the <codeph>ALTER TABLE</codeph>
+        clause <codeph>WITH REORGANIZE</codeph>.</p>
       <p>The parameter can be set for a database system or a session. The parameter cannot be set
         for a specific database.</p>
       <table id="table_gxf_hpm_z1b">
@@ -4878,7 +4878,9 @@
   <topic id="gp_resource_group_cpu_limit">
     <title>gp_resource_group_cpu_limit</title>
     <body>
-      <note type="warning">Resource group-based workload management is an experimental feature and is not intended for use in a production environment. Experimental features are subject to change without notice in future releases.</note>
+      <note type="warning">Resource group-based workload management is an experimental feature and
+        is not intended for use in a production environment. Experimental features are subject to
+        change without notice in future releases.</note>
       <p>Identifies the maximum percentage of system CPU resources to allocate to resource groups on
         each Greenplum Database segment node.</p>
       <table id="gp_resource_group_cpu_limit_table">
@@ -4907,7 +4909,9 @@
   <topic id="gp_resource_group_memory_limit">
     <title>gp_resource_group_memory_limit</title>
     <body>
-     <note type="warning">Resource group-based workload management is an experimental feature and is not intended for use in a production environment. Experimental features are subject to change without notice in future releases.</note>
+      <note type="warning">Resource group-based workload management is an experimental feature and
+        is not intended for use in a production environment. Experimental features are subject to
+        change without notice in future releases.</note>
       <p>Identifies the maximum percentage of system memory resources to allocate to resource groups
         on each Greenplum Database segment node.</p>
       <table id="gp_resource_group_memory_limit_table">
@@ -4936,7 +4940,9 @@
   <topic id="gp_resource_manager">
     <title>gp_resource_manager</title>
     <body>
-      <note type="warning">Resource group-based workload management is an experimental feature and is not intended for use in a production environment. Experimental features are subject to change without notice in future releases.</note>
+      <note type="warning">Resource group-based workload management is an experimental feature and
+        is not intended for use in a production environment. Experimental features are subject to
+        change without notice in future releases.</note>
       <p>Identifies the resource management scheme currently enabled in the Greenplum Database
         cluster. The default scheme is workload management using resource queues.</p>
       <table id="gp_resource_manager_table">
@@ -7213,7 +7219,9 @@
   <topic id="max_resource_groups">
     <title>max_resource_groups</title>
     <body>
-      <note type="warning">Resource group-based workload management is an experimental feature and is not intended for use in a production environment. Experimental features are subject to change without notice in future releases.</note>
+      <note type="warning">Resource group-based workload management is an experimental feature and
+        is not intended for use in a production environment. Experimental features are subject to
+        change without notice in future releases.</note>
       <p>Sets the maximum number of resource groups that you can create in a Greenplum Database
         system. Resource groups are defined system-wide.</p>
       <table id="max_resource_queues_table">
diff --git a/gpdb-doc/dita/ref_guide/sql_commands/COPY.xml b/gpdb-doc/dita/ref_guide/sql_commands/COPY.xml
index db3d097797..9e0b2f2f86 100644
--- a/gpdb-doc/dita/ref_guide/sql_commands/COPY.xml
+++ b/gpdb-doc/dita/ref_guide/sql_commands/COPY.xml
@@ -7,7 +7,7 @@
     <p id="sql_command_desc">Copies data between a file and a table.</p>
     <section id="section2">
       <title>Synopsis</title>
-      <codeblock id="sql_command_synopsis">COPY <varname>table</varname> [(<varname>column</varname> [, ...])] FROM {'<varname>file</varname>' | STDIN}
+      <codeblock id="sql_command_synopsis">COPY <varname>table</varname> [(<varname>column</varname> [, ...])] FROM {'<varname>file</varname>' | PROGRAM '<varname>command</varname>' | STDIN}
      [ [WITH]  
        [ON SEGMENT]
        [BINARY]
@@ -23,7 +23,7 @@
        [[LOG ERRORS]  
        SEGMENT REJECT LIMIT <varname>count</varname> [ROWS | PERCENT] ]
 
-COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)} TO {'<varname>file</varname>' | STDOUT}
+COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)} TO {'<varname>file</varname>' | PROGRAM '<varname>command</varname>' | STDOUT}
       [ [WITH] 
         [ON SEGMENT]
         [BINARY]
@@ -123,6 +123,26 @@ COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)}
           </pt>
           <pd>The absolute path name of the input or output file.</pd>
         </plentry>
+        <plentry>
+          <pt>PROGRAM '<varname>command</varname>'</pt>
+          <pd>Specify a command to execute. The <varname>command</varname> must be specified from
+            the viewpoint of the Greenplum Database master host system, and must be executable by
+            the Greenplum Database administrator user (<codeph>gpadmin</codeph>). The <codeph>COPY
+              FROM</codeph> command reads the input from the standard output of the command, and for
+            the <codeph>COPY TO</codeph> command, the output is written to the standard input of the
+            command.</pd>
+          <pd>The <varname>command</varname> is invoked by a shell. When passing arguments to the
+            shell, strip or escape any special characters that have a special meaning for the shell.
+            For security reasons, it is best to use a fixed command string, or at least avoid
+            passing any user input in the string. </pd>
+          <pd>When <codeph>ON SEGMENT</codeph> is specified, the command must be executable on all
+            Greenplum Database primary segment hosts by the Greenplum Database administrator user
+              (<codeph>gpadmin</codeph>). The command is executed by each Greenplum segment
+            instance. The <codeph>&lt;SEGID></codeph> is required in the
+            <varname>command</varname>.</pd>
+          <pd>See the <codeph>ON SEGMENT</codeph> clause for information about command syntax
+            requirements and the data that is copied when the clause is specified. </pd>
+        </plentry>
         <plentry>
           <pt>STDIN</pt>
           <pd>Specifies that input comes from the client application. The <codeph>ON
@@ -170,6 +190,10 @@ COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)}
                   instance. </pd>
               </plentry>
             </parml></pd>
+          <pd>When the <codeph>PROGRAM '<varname>command</varname>'</codeph> clause is specified, the
+              <codeph>&lt;SEGID></codeph> string literal is required in the
+              <varname>command</varname>; the <codeph>&lt;SEG_DATA_DIR></codeph> string literal is
+            optional. See <xref href="#topic1/section11" format="dita">Examples</xref>.</pd>
           <pd>For a <codeph>COPY FROM...ON SEGMENT</codeph> command, the table distribution policy
             is checked when data is copied into the table. By default, an error is returned if a
             data row violates the table distribution policy. You can disable the distribution policy
@@ -378,7 +402,8 @@ COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)}
           <codeph>COPY FROM...ON SEGMENT</codeph> was run.</p>
       <note>If you run <codeph>COPY FROM...ON SEGMENT</codeph>and the server configuration parameter
           <codeph>gp_enable_segment_copy_checking</codeph> is <codeph>false</codeph>, manual
-        redistribution of table data might be required.</note>
+        redistribution of table data might be required. See the <codeph>ALTER TABLE</codeph> clause
+          <codeph>WITH REORGANIZE</codeph>.</note>
       <p>When you specify the <codeph>LOG ERRORS</codeph> clause, Greenplum Database captures errors
         that occur while reading the external table data. You can view and manage the captured error
         log data. </p>
@@ -583,12 +608,9 @@ COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)}
         isolation mode and log errors:</p>
       <codeblock>COPY sales FROM '/home/usr1/sql/sales_data' LOG ERRORS 
    SEGMENT REJECT LIMIT 10 ROWS;</codeblock>
-      <p>To copy segment data for later use, use the <codeph>ON SEGMENT</codeph> argument. Use of
-        the <codeph>COPY TO ON SEGMENT</codeph> argument takes the form:</p>
-      <p><codeph>COPY</codeph>
-        <varname>table</varname> TO
-          '&lt;SEG_DATA_DIR>/<varname>gpdumpname</varname>&lt;SEGID>_<varname>suffix</varname>' ON
-        SEGMENT; </p>
+      <p>To copy segment data for later use, use the <codeph>ON SEGMENT</codeph> clause. Use of the
+          <codeph>COPY TO ON SEGMENT</codeph> command takes the form:</p>
+      <codeblock>COPY <varname>table</varname> TO '&lt;SEG_DATA_DIR>/<varname>gpdumpname</varname>&lt;SEGID>_<varname>suffix</varname>' ON SEGMENT; </codeblock>
       <p>The <codeph>&lt;SEGID></codeph> is required. However, you can substitute an absolute path
         for the <codeph>&lt;SEG_DATA_DIR></codeph> string literal in the path. </p>
       <p>When you pass in the string literal <codeph>&lt;SEG_DATA_DIR></codeph> and
@@ -597,14 +619,10 @@ COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)}
       <p>For example, if you have <codeph>mytable</codeph> with the segments and mirror segments
         like
         this:<codeblock>contentid | dbid | file segment location 
-    0     |  1   |/home/usr1/data1/gpsegdir0
-
+    0     |  1   | /home/usr1/data1/gpsegdir0
     0     |  3   | /home/usr1/data_mirror1/gpsegdir0 
-
     1     |  4   | /home/usr1/data2/gpsegdir1
-
-    1     |  2   | /home/usr1/data_mirror2/gpsegdir1 
-</codeblock>running
+    1     |  2   | /home/usr1/data_mirror2/gpsegdir1 </codeblock>running
         the
         command:<codeblock>COPY mytable TO '&lt;SEG_DATA_DIR>/gpbackup&lt;SEGID>.txt' ON SEGMENT;</codeblock>
         would result in the following
@@ -624,6 +642,22 @@ COPY {table [(<varname>column</varname> [, ...])] | (<varname>query</varname>)}
           necessary.<note>Tools such as <codeph>gpfdist</codeph> can be used to restore data. The
           backup/restore tools will not work with files that were manually generated with
             <codeph>COPY TO ON SEGMENT</codeph>. </note></p>
+      <p>This example copies the data from the <codeph>lineitem</codeph> table and uses the
+          <codeph>PROGRAM</codeph> clause to write the data to the
+          <codeph>/tmp/lineitem_program.csv</codeph> file with the <codeph>cat</codeph> utility. The
+        file is placed on the Greenplum Database
+        master.<codeblock>COPY LINEITEM TO PROGRAM 'cat > /tmp/lineitem_program.csv' CSV; </codeblock></p>
+      <p>This example uses the <codeph>PROGRAM</codeph> and <codeph>ON SEGMENT</codeph> clauses to
+        copy data to files on the segment hosts. On the segment hosts, the <codeph>COPY</codeph>
+        command replaces <codeph>&lt;SEGID></codeph> with the segment content ID to create a file
+        for each segment instance on the segment
+        host.<codeblock>COPY LINEITEM TO PROGRAM 'cat > /tmp/lineitem_program&lt;SEGID>.csv' ON SEGMENT CSV; </codeblock></p>
+      <p>This example uses the <codeph>PROGRAM</codeph> and <codeph>ON SEGMENT</codeph> clauses to
+        copy data from files on the segment hosts. The <codeph>COPY</codeph> command replaces
+          <codeph>&lt;SEGID></codeph> with the segment content ID when copying data from the files.
+        On each segment host, there must be a file for each segment instance, and the name of
+        each file must contain the content ID of its segment instance.
+        <codeblock>COPY LINEITEM_4 FROM PROGRAM 'cat /tmp/lineitem_program&lt;SEGID>.csv' ON SEGMENT CSV;</codeblock></p>
     </section>
     <section id="section12">
       <title>Compatibility</title>
diff --git a/gpdb-doc/dita/utility_guide/client_utilities/psql.xml b/gpdb-doc/dita/utility_guide/client_utilities/psql.xml
index 85323fd597..db0bcdcf28 100644
--- a/gpdb-doc/dita/utility_guide/client_utilities/psql.xml
+++ b/gpdb-doc/dita/utility_guide/client_utilities/psql.xml
@@ -336,9 +336,9 @@ testdb=#</codeblock>
         </plentry>
         <plentry>
           <pt>\copy {<varname>table</varname> [(<varname>column_list</varname>)] |
-              (<varname>query</varname>)} {from | to} {<varname>filename</varname> | stdin | stdout
-            | pstdin | pstdout} [with] [binary] [oids] [delimiter [as]
-            '<varname>character</varname>'] [null [as] '<varname>string</varname>'] [csv [header]
+              (<varname>query</varname>)} {from | to} {'<varname>filename</varname>' | stdin |
+            stdout | pstdin | pstdout} [with] [binary] [oids] [delimiter [as]
+              '<varname>character</varname>'] [null [as] '<varname>string</varname>'] [csv [header]
             [quote [as] 'character'] [escape [as] '<varname>character</varname>'] [force quote
             column_list] [force not null column_list]]</pt>
           <pd>Performs a frontend (client) copy. This is an operation that runs an SQL
@@ -497,8 +497,10 @@ testdb=#</codeblock>
           <pd>Lists all database roles, or only those that match pattern.</pd>
         </plentry>
         <plentry>
-          <pt>\dx [<varname>extension_pattern</varname>] | \dx+ [<varname>extension_pattern</varname>]</pt>
-          <pd>Lists all installed extensions, or only those that match the pattern. <codeph>\dx</codeph> and <codeph>\dx+</codeph> are functionally equivalent.</pd>
+          <pt>\dx [<varname>extension_pattern</varname>] | \dx+
+              [<varname>extension_pattern</varname>]</pt>
+          <pd>Lists all installed extensions, or only those that match the pattern.
+              <codeph>\dx</codeph> and <codeph>\dx+</codeph> are functionally equivalent.</pd>
         </plentry>
         <plentry>
           <pt>\e | \edit [<varname>filename</varname>]</pt>
-- 
GitLab