Merge pull request #22300 from ClickHouse/trying_parallel_func_tests

Trying parallel func tests

Merge pull request #22300 from ClickHouse/trying_parallel_func_tests
Trying parallel func tests
e7df8893 · Alexander Kuzmenkov · GitHub · 93bd2c3b · 414d6fb2 · e7df8893
9 changed files
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@@ -74,12 +74,17 @@ function run_tests()
        ADDITIONAL_OPTIONS+=('--order=random')
        ADDITIONAL_OPTIONS+=('--skip')
        ADDITIONAL_OPTIONS+=('00000_no_tests_to_skip')
-        ADDITIONAL_OPTIONS+=('--jobs')
-        ADDITIONAL_OPTIONS+=('4')
+        # Note that flaky check must be run in parallel, but for now we run
+        # everything in parallel except DatabaseReplicated. See below.
    fi

    if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
        ADDITIONAL_OPTIONS+=('--replicated-database')
+    else
+        # Too many tests fail for DatabaseReplicated in parallel. All other
+        # configurations are OK.
+        ADDITIONAL_OPTIONS+=('--jobs')
+        ADDITIONAL_OPTIONS+=('8')
    fi

    clickhouse-test --testname --shard --zookeeper --hung-check --print-time \

--- a/src/Storages/System/StorageSystemClusters.cpp
+++ b/src/Storages/System/StorageSystemClusters.cpp
@@ -38,7 +38,21 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, const Context
    for (const auto & name_and_database : databases)
    {
        if (const auto * replicated = typeid_cast<const DatabaseReplicated *>(name_and_database.second.get()))
-            writeCluster(res_columns, {name_and_database.first, replicated->getCluster()});
+        {
+            // A quick fix for stateless tests with DatabaseReplicated. Its ZK
+            // node can be destroyed at any time. If another test lists
+            // system.clusters to get client command line suggestions, it will
+            // get an error when trying to get the info about DB from ZK.
+            // Just ignore these inaccessible databases. A good example of a
+            // failing test is `01526_client_start_and_exit`.
+            try {
+                writeCluster(res_columns, {name_and_database.first, replicated->getCluster()});
+            }
+            catch (...)
+            {
+                tryLogCurrentException(__PRETTY_FUNCTION__);
+            }
+        }
    }
 }


--- a/tests/clickhouse-test
+++ b/tests/clickhouse-test
@@ -305,6 +305,9 @@ def run_tests_array(all_tests_with_params):
    failures_total = 0
    failures = 0
    failures_chain = 0
+    start_time = datetime.now()
+
+    is_concurrent = multiprocessing.current_process().name != "MainProcess"

    client_options = get_additional_client_options(args)

@@ -315,7 +318,7 @@ def run_tests_array(all_tests_with_params):
            return ''

    if all_tests:
-        print("\nRunning {} {} tests.".format(len(all_tests), suite) + "\n")
+        print(f"\nRunning {len(all_tests)} {suite} tests ({multiprocessing.current_process().name}).\n")

    for case in all_tests:
        if SERVER_DIED:
@@ -330,7 +333,6 @@ def run_tests_array(all_tests_with_params):

        try:
            status = ''
-            is_concurrent = multiprocessing.current_process().name != "MainProcess"
            if not is_concurrent:
                sys.stdout.flush()
                sys.stdout.write("{0:72}".format(name + ": "))
@@ -499,12 +501,18 @@ def run_tests_array(all_tests_with_params):
    failures_total = failures_total + failures

    if failures_total > 0:
-        print(colored("\nHaving {failures_total} errors! {passed_total} tests passed. {skipped_total} tests skipped.".format(
-            passed_total = passed_total, skipped_total = skipped_total, failures_total = failures_total), args, "red", attrs=["bold"]))
+        print(colored(f"\nHaving {failures_total} errors! {passed_total} tests passed."
+            f" {skipped_total} tests skipped. {(datetime.now() - start_time).total_seconds():.2f} s elapsed"
+            f' ({multiprocessing.current_process().name}).',
+            args, "red", attrs=["bold"]))
        exit_code = 1
    else:
-        print(colored("\n{passed_total} tests passed. {skipped_total} tests skipped.".format(
-            passed_total = passed_total, skipped_total = skipped_total), args, "green", attrs=["bold"]))
+        print(colored(f"\n{passed_total} tests passed. {skipped_total} tests skipped."
+            f" {(datetime.now() - start_time).total_seconds():.2f} s elapsed"
+            f' ({multiprocessing.current_process().name}).',
+            args, "green", attrs=["bold"]))
+
+    sys.stdout.flush()


 server_logs_level = "warning"
@@ -799,7 +807,8 @@ def main(args):
                if jobs > run_total:
                    run_total = jobs

-                batch_size = len(parallel_tests) // jobs
+                # Create two batches per process for more uniform execution time.
+                batch_size = max(1, len(parallel_tests) // (jobs * 2))
                parallel_tests_array = []
                for i in range(0, len(parallel_tests), batch_size):
                    parallel_tests_array.append((parallel_tests[i:i+batch_size], suite, suite_dir, suite_tmp_dir))

--- a/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.reference
+++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.reference
@@ -6,11 +6,9 @@
 4
 1
 0
-0
 6
 2
 -----
 6
 3
 0
-0
--- a/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.sql
+++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_versioned_collapsing_merge_tree.sql
@@ -62,7 +62,11 @@ OPTIMIZE TABLE four_rows_per_granule FINAL;

 SELECT COUNT(*) FROM four_rows_per_granule;

-SELECT distinct(marks) from system.parts WHERE table = 'four_rows_per_granule' and database=currentDatabase() and active=1;
+-- We expect zero marks here, so we might get zero rows if all the parts were
+-- deleted already. This can happen in parallel runs where there may be a long delay
+-- between queries. So we must write the query in such a way that it always returns
+-- zero rows if OK.
+SELECT distinct(marks) d from system.parts WHERE table = 'four_rows_per_granule' and database=currentDatabase() and active=1 having d > 0;

 INSERT INTO four_rows_per_granule (p, k, v1, v2, Sign, Version) VALUES ('2018-05-15', 1, 1000, 2000, 1, 1), ('2018-05-16', 2, 3000, 4000, 1, 1), ('2018-05-17', 3, 5000, 6000, 1, 1), ('2018-05-18', 4, 7000, 8000, 1, 1);

@@ -120,6 +124,10 @@ OPTIMIZE TABLE six_rows_per_granule FINAL;

 SELECT COUNT(*) FROM six_rows_per_granule;

-SELECT distinct(marks) from system.parts WHERE table = 'six_rows_per_granule' and database=currentDatabase() and active=1;
+-- We expect zero marks here, so we might get zero rows if all the parts were
+-- deleted already. This can happen in parallel runs where there may be a long delay
+-- between queries. So we must write the query in such a way that it always returns
+-- zero rows if OK.
+SELECT distinct(marks) d from system.parts WHERE table = 'six_rows_per_granule' and database=currentDatabase() and active=1 having d > 0;

 DROP TABLE IF EXISTS six_rows_per_granule;
--- a/tests/queries/0_stateless/00976_system_stop_ttl_merges.sql
+++ b/tests/queries/0_stateless/00976_system_stop_ttl_merges.sql
@@ -2,7 +2,7 @@ drop table if exists ttl;

 create table ttl (d Date, a Int) engine = MergeTree order by a partition by toDayOfMonth(d) ttl d + interval 1 day;

-system stop ttl merges;
+system stop ttl merges ttl;

 insert into ttl values (toDateTime('2000-10-10 00:00:00'), 1), (toDateTime('2000-10-10 00:00:00'), 2)
 insert into ttl values (toDateTime('2100-10-10 00:00:00'), 3), (toDateTime('2100-10-10 00:00:00'), 4);
@@ -11,7 +11,7 @@ select sleep(1) format Null; -- wait if very fast merge happen
 optimize table ttl partition 10 final;
 select * from ttl order by d, a;

-system start ttl merges;
+system start ttl merges ttl;
 optimize table ttl partition 10 final;
 select * from ttl order by d, a;


--- a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.reference
+++ b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.reference
@@ -12,4 +12,3 @@ Check if another query is passed
 Modify max_concurrent_queries back to 1
 Check if another query with less marks to read is throttled
 yes
-finished	long_running_query	default	select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null
--- a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh
+++ b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh
@@ -18,9 +18,11 @@ settings index_granularity = 1, max_concurrent_queries = 1, min_marks_to_honor_m
 insert into simple select number, number + 100 from numbers(1000);
 "

+query_id="long_running_query-$CLICKHOUSE_DATABASE"
+
 echo "Spin up a long running query"
-${CLICKHOUSE_CLIENT} --query "select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null" --query_id "long_running_query" > /dev/null 2>&1 &
-wait_for_query_to_start 'long_running_query'
+${CLICKHOUSE_CLIENT} --query "select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null" --query_id "$query_id" > /dev/null 2>&1 &
+wait_for_query_to_start "$query_id"

 # query which reads marks >= min_marks_to_honor_max_concurrent_queries is throttled
 echo "Check if another query with some marks to read is throttled"
@@ -61,7 +63,7 @@ CODE=$?
 [ "$CODE" -ne "202" ] && echo "Expected error code: 202 but got: $CODE" && exit 1;
 echo "yes"

-${CLICKHOUSE_CLIENT} --query "KILL QUERY WHERE query_id = 'long_running_query' SYNC"
+${CLICKHOUSE_CLIENT} --query "KILL QUERY WHERE query_id = '$query_id' SYNC FORMAT Null"
 wait

 ${CLICKHOUSE_CLIENT} --multiline --multiquery --query "

--- a/tests/queries/skip_list.json
+++ b/tests/queries/skip_list.json
@@ -641,6 +641,7 @@
        "01542_dictionary_load_exception_race",
        "01545_system_errors", // looks at the difference of values in system.errors
        "01560_optimize_on_insert_zookeeper",
+        "01563_distributed_query_finish", // looks at system.errors which is global
        "01575_disable_detach_table_of_dictionary",
        "01593_concurrent_alter_mutations_kill",
        "01593_concurrent_alter_mutations_kill_many_replicas",
@@ -667,6 +668,7 @@
        "01702_system_query_log", // Runs many global system queries
        "01715_background_checker_blather_zookeeper",
        "01721_engine_file_truncate_on_insert", // It's ok to execute in parallel but not several instances of the same test.
+        "01722_long_brotli_http_compression_json_format", // it is broken in some unimaginable way with the genius error "cannot write to ofstream", not sure how to debug this
        "01747_alter_partition_key_enum_zookeeper",
        "01748_dictionary_table_dot", // creates database
        "01760_polygon_dictionaries",