Commit 47687ca0, authored by Ning Yu

Check for data integrity on the expand_after_icw pipeline.

Parent d824012e
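The check added here dumps all user data with pg_dumpall before and after the expansion, normalizes both dumps with the sort_dump helper defined below, and diffs the results; any difference means rows were lost or altered by the redistribution. A minimal sketch of the same idea outside the pipeline (the /tmp file paths are hypothetical, the pg_dumpall flags are the ones used by the script):

pg_dumpall --inserts -Oxaf /tmp/dump.before.sql   # data-only dump as plain INSERT statements
# ... run gpexpand and redistribute the tables here ...
pg_dumpall --inserts -Oxaf /tmp/dump.after.sql
sort_dump < /tmp/dump.before.sql > /tmp/sorted.before.sql
sort_dump < /tmp/dump.after.sql  > /tmp/sorted.after.sql
diff -u0 /tmp/sorted.before.sql /tmp/sorted.after.sql   # a non-empty diff fails the check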
@@ -78,21 +78,49 @@ EOF
echo "$pgoptions"
}
# detect partial tables in all the non-template databases,
# exit code is 0 if no partial table is found, or 1 otherwise
function list_partial_tables() {
local pgoptions="$(get_pgoptions)"
su gpadmin -c bash <<EOF
. /usr/local/greenplum-db-devel/greenplum_path.sh
export PGOPTIONS='$pgoptions'
python $CWDIR/scan_partial_table.py
EOF
}
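# Hypothetical call site, shown only to illustrate the exit-code contract
# (expand_cluster below consumes this helper the same way):
#   if ! list_partial_tables; then
#       echo "error: some tables still have a stale numsegments"
#   fi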
# usage: sort_dump < input_file > output_file
#
# filter and sort the 'INSERT INTO' lines of "pg_dumpall --inserts" output,
# and append the database name to the end of each line as a comment.
function sort_dump() {
sed -nrf "$CWDIR/filter_dump.sed" | sort
}
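# Illustration of the contract (the table name and value are made up): filter_dump.sed
# rewrites the \connect line emitted by pg_dumpall into a trailing comment on each
# following INSERT, so e.g.
#   printf '\\connect postgres\nINSERT INTO t1 VALUES (1);\n' | sort_dump
# prints
#   INSERT INTO t1 VALUES (1); /* DATABASE: postgres */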
# usage: expand_cluster <old_size> <new_size>
function expand_cluster() {
local old="$1"
local new="$2"
local inputfile="/tmp/inputfile.${old}-${new}"
local pidfile="/tmp/postmaster.pid.${old}-${new}"
local dump_before="/tmp/dump.${old}-${new}.before.sql"
local dump_after="/tmp/dump.${old}-${new}.after.sql"
local sorted_dump_before="/tmp/sorted-dump.${old}-${new}.before.sql"
local sorted_dump_after="/tmp/sorted-dump.${old}-${new}.after.sql"
local sorted_dump_diff="/tmp/sorted-dump.${old}-${new}.diff"
local dbname="postgres"
local pgoptions="$(get_pgoptions)"
local retval=0
local uncompleted
local partial
pushd gpdb_src/gpAux/gpdemo
gen_gpexpand_input "$old" "$new"
# dump before expansion
su gpadmin -c "pg_dumpall --inserts -Oxaf '$dump_before'"
# Back up the master pid; by checking it later we can tell whether the cluster
# was restarted during the tests.
su gpadmin -c "head -n 1 $MASTER_DATA_DIRECTORY/postmaster.pid >$pidfile"
@@ -104,23 +132,40 @@ function expand_cluster() {
uncompleted=$(su gpadmin -c "psql -Aqtd $dbname -c \"select count(*) from gpexpand.status_detail where status <> 'COMPLETED'\"")
# cleanup
su gpadmin -c "yes | PGOPTIONS='$pgoptions' gpexpand -s -c"
su gpadmin -c "dropdb $dbname" 2>/dev/null || : # ignore failure
# dump after expansion
su gpadmin -c "pg_dumpall --inserts -Oxaf '$dump_after'"
popd
if [ "$uncompleted" -ne 0 ]; then
echo "error: some tables are not successfully expanded"
return 1
echo "error: fail to expand some tables"
retval=1
fi
# double check gp_distribution_policy.numsegments in every database
if ! list_partial_tables; then
echo "error: some tables are not expanded"
retval=1
fi
echo "checking for data integration after expansion..."
sort_dump < "$dump_before" > "$sorted_dump_before"
sort_dump < "$dump_after" > "$sorted_dump_after"
if diff -u0 "$sorted_dump_before" "$sorted_dump_after" >"$sorted_dump_diff"; then
echo "before and after dumps have no difference"
else
echo "error: before and after dumps differ, here are part of the sorted diff:"
head -n50 "$sorted_dump_diff"
retval=1
fi
if [ "$retval" -eq 0 ]; then
echo "all the tables are successfully expanded"
fi
return $retval
}
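# Hypothetical invocation (the segment counts are made up; the real job derives
# them from the demo cluster configuration), e.g.:
#   expand_cluster 2 3   # grow the demo cluster from 2 to 3 primaries and verify the data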
# usage: make_cluster [<demo_cluster_options>]
......
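# filter_dump.sed: keep only the INSERT lines of a pg_dumpall output, tagging each with its database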
# for each database name line (\connect <dbname>)
\@^\\connect (.*)$@{
# rewrite it into a trailing comment: /* DATABASE: <dbname> */
s@@ /* DATABASE: \1 */@;
# copy it to hold space
h;
}
# for each INSERT command
\@^INSERT INTO @{
# append the database name from hold space
G;
# join the two lines
s@\n@@;
# output it
p;
}
#!/usr/bin/env python
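# scan_partial_table.py: report tables whose numsegments does not match the cluster size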
import sys
from gppylib.db import dbconn
list_dbs_sql = '''
select datname from pg_database
where datallowconn and not datistemplate
'''
get_cluster_size_sql = '''
select numsegments from gp_toolkit.__gp_number_of_segments
'''
scan_sql = '''
select n.nspname, c.relname
from gp_distribution_policy d
join pg_class c on c.oid = d.localoid
join pg_namespace n on n.oid = c.relnamespace
where d.numsegments <> {cluster_size:d}
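-- skip external tables (relstorage 'x'); they hold no local data to redistribute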
and c.relstorage <> 'x'
'''
dburl = dbconn.DbURL()
conn = dbconn.connect(dburl)
cursor = dbconn.execSQL(conn, list_dbs_sql)
dbnames = [row[0] for row in cursor]
cursor.close()
cluster_size = int(dbconn.execSQLForSingleton(conn, get_cluster_size_sql))
conn.close()
print('scanning for partial tables...')
retval = 0
for dbname in dbnames:
dburl = dbconn.DbURL(dbname=dbname)
conn = dbconn.connect(dburl)
cursor = dbconn.execSQL(conn, scan_sql.format(cluster_size=cluster_size))
if cursor.rowcount > 0:
retval = 1
for row in cursor:
print('- "{dbname}"."{namespace}"."{relname}"'.format(
dbname=dbname.replace('"', '""'),
namespace=row[0].replace('"', '""'),
relname=row[1].replace('"', '""')))
cursor.close()
conn.close()
sys.exit(retval)
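For reference, the scanner can also be run by hand against a live cluster, a sketch assuming the same install path used elsewhere in this pipeline and the usual PG* environment of a running demo cluster (the echo messages are only illustrative):

. /usr/local/greenplum-db-devel/greenplum_path.sh
python scan_partial_table.py && echo "no partial tables" || echo "partial tables found"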