diff --git a/gpAux/gpperfmon/src/gpmon/gpmon_agg.c b/gpAux/gpperfmon/src/gpmon/gpmon_agg.c
index 83e0d2d5f7448a9390289dada9c8a50b90b7cfdd..232382150321643fd6629b5b2c7e62d6decd11aa 100644
--- a/gpAux/gpperfmon/src/gpmon/gpmon_agg.c
+++ b/gpAux/gpperfmon/src/gpmon/gpmon_agg.c
@@ -1391,6 +1391,7 @@ static int get_and_print_next_query_file_kvp(FILE* outfd, FILE* queryfd, char* q
 	char *p = NULL;
 	int field_len = 0;
 	int retCode = 0;
+	bool replace_line_breaks = false;
 
 	p = fgets(line, line_size, queryfd);
 	line[line_size-1] = 0; // in case libc is buggy
@@ -1421,6 +1422,10 @@ static int get_and_print_next_query_file_kvp(FILE* outfd, FILE* queryfd, char* q
 		return APR_NOTFOUND;
 	}
 
+	if (field_len >= MAX_GP_MAX_CSV_LINE_LENGTH - HARVEST_CSV_SAFEGUARD) {
+		replace_line_breaks = true;
+	}
+
 	fprintf(outfd, "\"");
 	(*bytes_written)++;
 
@@ -1437,6 +1442,19 @@
 		(*bytes_written)++;
 	}
 
+	/**
+	 * MPP-29418: COPY/external tables have a limit on CSV line length.
+	 * If a row containing line breaks is loaded and its length exceeds
+	 * gp_max_csv_line_length, the CSV is treated as badly formatted and
+	 * the query fails.
+	 * This workaround checks the field length and replaces line breaks
+	 * with spaces to prevent the load failure.
+	 * A long query statement may come out slightly changed, but that is
+	 * still better than failing to load or truncating the query text.
+	 */
+	if (replace_line_breaks && (*p == '\n' || *p == '\r'))
+		*p = ' ';
+
 	fputc(*p, outfd);
 	(*bytes_written)++;
 
diff --git a/gpAux/gpperfmon/src/gpmon/gpmondb.c b/gpAux/gpperfmon/src/gpmon/gpmondb.c
index b8458da178a17ab6ced51bd7787d167585a84e32..b3230a6d276e861ddf6e7af4d693777a706be1cf 100644
--- a/gpAux/gpperfmon/src/gpmon/gpmondb.c
+++ b/gpAux/gpperfmon/src/gpmon/gpmondb.c
@@ -58,11 +58,8 @@ static const bool gpdb_exec_ddl(PGconn* conn, const char* ddl_query)
 	return errmsg == NULL;
 }
 
-// creates a connection and then runs the query
-static const char* gpdb_exec(PGconn** pconn, PGresult** pres, const char* query)
+static const char* gpdb_exec_with_connstr(PGconn** pconn, PGresult** pres, const char* query, const char* connstr)
 {
-	const char *connstr = "dbname='" GPMON_DB "' user='" GPMON_DBUSER
-						  "' connect_timeout='30'";
 	PGconn *conn = NULL;
 
 	conn = PQconnectdb(connstr);
@@ -75,6 +72,26 @@ static const char* gpdb_exec(PGconn** pconn, PGresult** pres, const char* query)
 	return gpdb_exec_only(conn, pres, query);
 }
 
+// creates a connection and then runs the query
+static const char* gpdb_exec(PGconn** pconn, PGresult** pres, const char* query)
+{
+	const char *connstr = "dbname='" GPMON_DB "' user='" GPMON_DBUSER
+						  "' connect_timeout='30'";
+
+	return gpdb_exec_with_connstr(pconn, pres, query, connstr);
+}
+
+static const char* gpdb_exec_harvest(PGconn** pconn, PGresult** pres, const char* query)
+{
+	const int CONNBUFSIZ = 256;
+	char connstr[CONNBUFSIZ];
+
+	snprintf(connstr, CONNBUFSIZ, "dbname='" GPMON_DB "' user='" GPMON_DBUSER
+			 "' connect_timeout='30' options='-c gp_max_csv_line_length=%d'", MAX_GP_MAX_CSV_LINE_LENGTH);
+
+	return gpdb_exec_with_connstr(pconn, pres, query, connstr);
+}
+
 // persistant_conn is optional if you are already holding an open connectionconn
 // return 1 if more than 0 rows are returned from query
 // return 0 if zero rows are returned from query
@@ -920,7 +937,7 @@ static apr_status_t harvest(const char* tbl, apr_pool_t* pool, PGconn* conN)
 
 	snprintf(qrybuf, QRYBUFSIZ, QRYFMT, tbl, tbl);
 
-	errmsg = gpdb_exec(&conn, &result, qrybuf);
+	errmsg = gpdb_exec_harvest(&conn, &result, qrybuf);
 	if (errmsg)
 	{
 		res = 1;
diff --git a/gpAux/gpperfmon/src/gpmon/gpmondb.h b/gpAux/gpperfmon/src/gpmon/gpmondb.h
index 76eecd965ba8b0a0e213c7f7738f20c78cf8ca79..59e4f2d92c7a050686c35d4270c5927488b14dd0 100644
--- a/gpAux/gpperfmon/src/gpmon/gpmondb.h
+++ b/gpAux/gpperfmon/src/gpmon/gpmondb.h
@@ -4,6 +4,7 @@
 #include "apr_general.h"
 #include "apr_md5.h"
 #include "apr_hash.h"
+#include "cdb/cdbcsv.h"
 
 /**
  * Validate the the gpperfmon database is correct and
@@ -97,5 +98,12 @@ void process_line_in_hadoop_cluster_info(apr_pool_t*, apr_hash_t*, char*, char*,
 int get_hadoop_hosts_and_add_to_hosts(apr_pool_t*, apr_hash_t*, mmon_options_t*);
 apr_status_t truncate_file(char*, apr_pool_t*);
 
+/**
+ * MPP-29418 works around a COPY/external-table limitation: a CSV line that
+ * contains line breaks cannot exceed gp_max_csv_line_length. Subtract 1K to
+ * leave room for the fixed-length columns so the data load stays safe.
+ */
+#define HARVEST_CSV_SAFEGUARD (1024)
+
 #endif /* GPMONDB_H */
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index 276b54fe2caad2c84b4f08c9f60c925bed799731..2e84cf68fda78a840e8d91382ac019012e764f2f 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -21,6 +21,7 @@
 #include "access/url.h"
 #include "access/xlog_internal.h"
 #include "cdb/cdbappendonlyam.h"
+#include "cdb/cdbcsv.h"
 #include "cdb/cdbdisp.h"
 #include "cdb/cdbfilerep.h"
 #include "cdb/cdbsreh.h"
@@ -3642,7 +3643,7 @@
 			GUC_GPDB_ADDOPT
 		},
 		&gp_max_csv_line_length,
-		1 * 1024 * 1024, 32 * 1024, 4 * 1024 * 1024, NULL, NULL
+		1 * 1024 * 1024, 32 * 1024, MAX_GP_MAX_CSV_LINE_LENGTH, NULL, NULL
 	},
 
 	/*
diff --git a/src/include/cdb/cdbcsv.h b/src/include/cdb/cdbcsv.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3f79ce97c17ca8fb337a04b7cbd7bb44ef6b0e0
--- /dev/null
+++ b/src/include/cdb/cdbcsv.h
@@ -0,0 +1,21 @@
+/*-------------------------------------------------------------------------
+ *
+ * cdbcsv.h
+ *	  Header file for COPY/external-table CSV file-format processing
+ *
+ * Portions Copyright (c) 2005-2008, Greenplum inc
+ * Portions Copyright (c) 2012-Present Pivotal Software, Inc.
+ *
+ *
+ * IDENTIFICATION
+ *	    src/include/cdb/cdbcsv.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef CDBCSV_H
+#define CDBCSV_H
+
+#define MAX_GP_MAX_CSV_LINE_LENGTH (4 * 1024 * 1024)
+
+#endif /* CDBCSV_H */
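
Reviewer note: the gpmon_agg.c hunks are easier to follow in isolation, so here is a minimal, self-contained sketch of the same guard: once a field is long enough that the quoted CSV row could exceed gp_max_csv_line_length, its line breaks are flattened to spaces while the field is written out. The function name emit_csv_field, the demo string in main(), and the simplified quote handling are illustrative assumptions, not code from this patch; the real get_and_print_next_query_file_kvp() also tracks bytes_written and reads the field from the query text file.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Constants mirroring cdbcsv.h and gpmondb.h in this patch. */
#define MAX_GP_MAX_CSV_LINE_LENGTH (4 * 1024 * 1024)
#define HARVEST_CSV_SAFEGUARD      (1024)

/* Write one field as a quoted CSV value, flattening newlines in oversized fields. */
static void emit_csv_field(FILE *out, const char *field, size_t field_len)
{
	/* Same trigger as the patch: only fields near the limit lose their line breaks. */
	bool replace_line_breaks =
		(field_len >= MAX_GP_MAX_CSV_LINE_LENGTH - HARVEST_CSV_SAFEGUARD);
	size_t i;

	fputc('"', out);
	for (i = 0; i < field_len; i++)
	{
		char c = field[i];

		if (replace_line_breaks && (c == '\n' || c == '\r'))
			c = ' ';            /* keep the whole row on one physical CSV line */
		if (c == '"')
			fputc('"', out);    /* CSV-escape embedded double quotes */
		fputc(c, out);
	}
	fputc('"', out);
}

int main(void)
{
	const char *sample_query = "SELECT 1;\nSELECT 2;";

	/* A short field keeps its line break; only near-limit fields are flattened. */
	emit_csv_field(stdout, sample_query, strlen(sample_query));
	fputc('\n', stdout);
	return 0;
}
```

Compiled and run on its own, the sketch prints the short sample query with its newline intact, which is the intended behavior: query texts are only rewritten when they would otherwise push the harvested CSV row past the limit that gpdb_exec_harvest() raises to MAX_GP_MAX_CSV_LINE_LENGTH.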