From 9a9cd48bc7df94e91cd1c9340b261df5e9bdad9f Mon Sep 17 00:00:00 2001 From: Pengzhou Tang Date: Mon, 7 Aug 2017 20:34:02 -0400 Subject: [PATCH] Add debug info for interconnect network timeout It was very difficult to verify whether the interconnect is stuck in the resending phase or whether there is UDP resending latency within the interconnect. To improve this, this commit records a debug message every Gp_interconnect_debug_retry_interval retries when gp_log_interconnect is set to DEBUG. --- src/backend/cdb/cdbvars.c | 1 + src/backend/cdb/motion/ic_udpifc.c | 8 ++++++++ src/backend/utils/misc/guc_gp.c | 10 ++++++++++ src/include/cdb/cdbvars.h | 1 + 4 files changed, 20 insertions(+) diff --git a/src/backend/cdb/cdbvars.c b/src/backend/cdb/cdbvars.c index b1a4fb9ae7..7608e1b564 100644 --- a/src/backend/cdb/cdbvars.c +++ b/src/backend/cdb/cdbvars.c @@ -198,6 +198,7 @@ int Gp_interconnect_min_rto = 20; int Gp_interconnect_fc_method = INTERCONNECT_FC_METHOD_LOSS; int Gp_interconnect_transmit_timeout = 3600; int Gp_interconnect_min_retries_before_timeout = 100; +int Gp_interconnect_debug_retry_interval= 10; int Gp_interconnect_hash_multiplier = 2; /* sets the size of the * hash table used by diff --git a/src/backend/cdb/motion/ic_udpifc.c b/src/backend/cdb/motion/ic_udpifc.c index fbad9afd3b..4a8a6ee058 100644 --- a/src/backend/cdb/motion/ic_udpifc.c +++ b/src/backend/cdb/motion/ic_udpifc.c @@ -4899,6 +4899,14 @@ checkNetworkTimeout(ICBuffer *buf, uint64 now) * by OS for a long time. In this case, only a few times are tried. * Thus, the GUC Gp_interconnect_min_retries_before_timeout is added here. 
*/ + if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG && + buf->nRetry % Gp_interconnect_debug_retry_interval == 0) + { + ereport(LOG, (errmsg("resending packet (seq %d) to %s (pid %d cid %d) with %d retries in %lu seconds", + buf->pkt->seq, buf->conn->remoteHostAndPort, buf->pkt->dstPid, + buf->pkt->dstContentId, buf->nRetry, (now - buf->sentTime) / 1000 / 1000 ))); + } + if ((buf->nRetry > Gp_interconnect_min_retries_before_timeout) && (now - buf->sentTime) > ((uint64)Gp_interconnect_transmit_timeout * 1000 * 1000)) { ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index 87dcf7a60c..7f93bb0958 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -3831,6 +3831,16 @@ struct config_int ConfigureNamesInt_gp[] = 100, 1, 4096, NULL, NULL }, + { + {"gp_interconnect_debug_retry_interval", PGC_USERSET, GP_ARRAY_TUNING, + gettext_noop("Sets the interval by retry times to record a debug message for retry."), + NULL, + GUC_GPDB_ADDOPT + }, + &Gp_interconnect_debug_retry_interval, + 10, 1, 4096, NULL, NULL + }, + { {"gp_udp_bufsize_k", PGC_BACKEND, GP_ARRAY_TUNING, gettext_noop("Sets recv buf size of UDP interconnect, for testing."), diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index 3b8a0a3613..096740d3ea 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -439,6 +439,7 @@ extern int Gp_interconnect_default_rtt; extern int Gp_interconnect_min_rto; extern int Gp_interconnect_transmit_timeout; extern int Gp_interconnect_min_retries_before_timeout; +extern int Gp_interconnect_debug_retry_interval; /* UDP recv buf size in KB. For testing */ extern int Gp_udp_bufsize_k; -- GitLab