diff --git a/dbms/src/IO/ReadHelpers.h b/dbms/src/IO/ReadHelpers.h
index 6ec462e3ce5976f5f9fbf6df5c9b87c001f2a25f..b97a9c073c005351bb09f36b5768ceeb2618253c 100644
--- a/dbms/src/IO/ReadHelpers.h
+++ b/dbms/src/IO/ReadHelpers.h
@@ -35,6 +35,7 @@ namespace ErrorCodes
     extern const int CANNOT_PARSE_DATE;
     extern const int CANNOT_PARSE_DATETIME;
     extern const int CANNOT_READ_ARRAY_FROM_TEXT;
+    extern const int CANNOT_PARSE_NUMBER;
 }
 
 /// Helper functions for formatted input.
@@ -243,7 +244,12 @@ ReturnType readIntTextImpl(T & x, ReadBuffer & buf)
                 if (std::is_signed<T>::value)
                     negative = true;
                 else
-                    return ReturnType(false);
+                {
+                    if (throw_exception)
+                        throw Exception("Unsigned type must not contain '-' symbol", ErrorCodes::CANNOT_PARSE_NUMBER);
+                    else
+                        return ReturnType(false);
+                }
                 break;
             case '0':
             case '1':
@@ -887,6 +893,7 @@ inline T parse(const char * data, size_t size)
     T res;
     ReadBufferFromMemory buf(data, size);
     readText(res, buf);
+    assertEOF(buf);
     return res;
 }
diff --git a/dbms/src/IO/tests/var_uint.cpp b/dbms/src/IO/tests/var_uint.cpp
index 8a6c6f66733330c63e1fed25033a84f79ad74c01..5fe95205f3e13be826c366f187f7bbe752bac5d0 100644
--- a/dbms/src/IO/tests/var_uint.cpp
+++ b/dbms/src/IO/tests/var_uint.cpp
@@ -8,8 +8,32 @@
 #include
 
+static void parse_trash_string_as_uint_must_fail(const std::string & str)
+{
+    using namespace DB;
+
+    unsigned x = 0xFF;
+
+    try
+    {
+        x = parse<unsigned>(str);
+    }
+    catch (...)
+    {
+        /// Ok
+        return;
+    }
+
+    std::cerr << "Parsing must fail, but finished successfully x=" << x;
+    exit(-1);
+}
+
+
 int main(int argc, char ** argv)
 {
+    parse_trash_string_as_uint_must_fail("trash");
+    parse_trash_string_as_uint_must_fail("-1");
+
     if (argc != 2)
     {
         std::cerr << "Usage: " << std::endl
diff --git a/dbms/src/Server/HTTPHandler.cpp b/dbms/src/Server/HTTPHandler.cpp
index fb2863c14dbc01d15a0ebd627fe0cc6a0aae18e0..fc9aaa52cf41b70b6accf62848b78037f3f9743e 100644
--- a/dbms/src/Server/HTTPHandler.cpp
+++ b/dbms/src/Server/HTTPHandler.cpp
@@ -136,7 +136,14 @@ static std::chrono::steady_clock::duration parseSessionTimeout(const HTMLForm &
         unsigned max_session_timeout = config.getUInt("max_session_timeout", 3600);
         std::string session_timeout_str = params.get("session_timeout");
 
-        session_timeout = parse<unsigned>(session_timeout_str);
+        try
+        {
+            session_timeout = parse<unsigned>(session_timeout_str);
+        }
+        catch (...)
+        {
+            throw Exception(getCurrentExceptionMessage(false) + ". Invalid session timeout", ErrorCodes::INVALID_SESSION_TIMEOUT);
+        }
 
         if (session_timeout > max_session_timeout)
             throw Exception("Session timeout '" + session_timeout_str + "' is larger than max_session_timeout: " + toString(max_session_timeout)
diff --git a/dbms/tests/queries/0_stateless/00463_sessions_in_http_interface.sh b/dbms/tests/queries/0_stateless/00463_sessions_in_http_interface.sh
index 7fcbbca5bb5b4354a2a4c958bd7d31006106e7a0..c7fd1109a9c17447fddc03bc41ddf28c5d11d6a8 100755
--- a/dbms/tests/queries/0_stateless/00463_sessions_in_http_interface.sh
+++ b/dbms/tests/queries/0_stateless/00463_sessions_in_http_interface.sh
@@ -44,7 +44,7 @@ check "$url$session&session_check=0" "$select" "Exception" 0 "session_check=0 do
 request $url$session "SET max_rows_to_read=7777777"
 check "$url$session&session_timeout=string" "$select" "Exception.*Invalid session timeout" 1 "Non-numeric value accepted as a timeout."
-check "$url$session&session_timeout=3601" "$select" "Exception.*Invalid session timeout" 1 "More than 3600 seconds accepted as a timeout."
+check "$url$session&session_timeout=3601" "$select" "Exception.*Maximum session timeout*" 1 "More than 3600 seconds accepted as a timeout."
 check "$url$session&session_timeout=-1" "$select" "Exception.*Invalid session timeout" 1 "Negative timeout accepted."
 check "$url$session&session_timeout=0" "$select" "Exception" 0 "Zero timeout not accepted."
 check "$url$session&session_timeout=3600" "$select" "Exception" 0 "3600 second timeout not accepted."
diff --git a/doc/index.html b/doc/index.html
deleted file mode 100644
index c2d019e6a2409838e1cdd0827eb1955126fe4e64..0000000000000000000000000000000000000000
--- a/doc/index.html
+++ /dev/null
@@ -1,522 +0,0 @@
ClickHouse — open-source distributed column-oriented DBMS
ClickHouse

ClickHouse is an open-source column-oriented database management system that allows generating analytical data reports in real time.

ClickHouse manages extremely large volumes of data in a stable and sustainable manner. It currently powers Yandex.Metrica, the world's second largest web analytics platform, with over 20.3 trillion database records and over 20 billion events a day, generating customized reports on the fly, directly from non-aggregated data. The system was also successfully implemented at CERN's LHCb experiment to store and process metadata on 10 billion events, with over 1000 attributes per event, registered in 2011.

ClickHouse. Just makes you think faster.

  • Run more queries in the same amount of time
  • Test more hypotheses
  • Slice and dice your data in many more new ways
  • Look at your data from new angles
  • Discover new dimensions

Linearly scalable

ClickHouse allows companies to add servers to their clusters when necessary without investing time or money into additional DBMS modification. The system has been successfully serving Yandex.Metrica, while the number of servers in its main cluster alone, located in six geographically distributed datacenters, has grown from 60 to 394 in two years.

ClickHouse scales well both vertically and horizontally. It is easily adapted to run on clusters of hundreds of nodes, on a single server, or even on a virtual machine. It currently has installations with more than two trillion rows per single node, as well as installations with 100 TB of storage per single node.

Hardware-efficient

ClickHouse processes typical analytical queries two to three orders of magnitude faster than traditional row-oriented systems with the same available I/O throughput. The system's columnar format allows fitting more hot data in the server's RAM, which leads to shorter response times.

ClickHouse minimizes the number of seeks for range queries, which increases the efficiency of rotational drives, as it continually maintains locality of reference for stored data.

ClickHouse is CPU-efficient thanks to its vectorized query execution and runtime code generation.

By minimizing data transfers for most types of queries, ClickHouse enables companies to manage their data and create reports without using a network designed for high-performance computing.

Fast

ClickHouse's performance exceeds that of comparable column-oriented DBMS currently available on the market. It processes hundreds of millions to more than a billion rows, and tens of gigabytes of data, per single server per second.

ClickHouse uses all available hardware to its full potential to process each query as fast as possible. The peak processing performance for a single query (after decompression, using only the needed columns) stands at more than 2 terabytes per second.

Fault-tolerant

ClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. Downtime of a single node or of a whole datacenter won't affect the system's availability for reads and writes. Distributed reads are automatically balanced to live replicas without increasing latency. Replicated data are synchronized automatically or semi-automatically after downtime.

Feature-rich

ClickHouse features a number of built-in user-friendly web analytics capabilities, including probabilistic data structures for fast and memory-efficient calculation of cardinalities and quantiles, functions for working with URLs and IPs (both IPv4 and IPv6), and functions for handling dates, times, and time zones.

Data management methods available in ClickHouse, such as arrays, array joins, and nested data structures, are extremely efficient for managing denormalized data.

ClickHouse supports both local and distributed joins, so distributed data can be joined with co-located data. It also offers external dictionaries (dimension tables loaded from an external source) for seamless joins.

ClickHouse supports approximate query processing: you can get results as fast as you want, which is indispensable when dealing with terabytes and petabytes of data.

The system's conditional aggregate functions and calculation of totals and extremes allow getting results with a single query, without having to run several.

Simple and handy

ClickHouse is simple and works out of the box. As well as performing on clusters of hundreds of nodes, the system can be easily installed on a single server or even a virtual machine. No development experience or code-writing skills are required to install ClickHouse.

Highly reliable

ClickHouse has been managing petabytes of data serving a number of high-load mass-audience services of Russia's leading search provider and one of Europe's largest IT companies, Yandex. Since 2012, ClickHouse has been providing robust database management for the company's web analytics service, comparison shopping platform, email service, online advertising platform, business intelligence, and infrastructure monitoring.

ClickHouse is a truly distributed system running on independent nodes, with no single point of failure.

Software or hardware failures or misconfigurations do not result in loss of data. Instead of deleting "broken" data, ClickHouse saves it, or asks you what to do before startup. All data are checksummed before every read from or write to disk or network. It is virtually impossible to delete data by accident.

ClickHouse offers flexible limits on query complexity and resource usage, which can be fine-tuned with settings. It is possible to simultaneously serve a number of high-priority low-latency requests and some long-running queries with lowered priority.

Opens new possibilities

ClickHouse streamlines all your data processing. It's easy to use: ingest all your structured data into the system, and it is instantly available for reports. New columns for new properties or dimensions can be easily added at any time without slowing the system down.

ClickHouse works 100-1,000x faster than traditional approaches. In contrast to data management methods where vast amounts of raw data in their native format are kept available as a "data lake" for any given query, ClickHouse in most cases offers instant results: the data is processed faster than it takes to formulate the query.
Key Features

  • True column-oriented
  • Vectorized query execution
  • Data compression
  • Parallel and distributed query execution
  • Real-time data ingestion
  • On-disk locality of reference
  • Real-time query processing
  • Cross-datacenter replication
  • High availability
  • SQL support
  • Local and distributed joins
  • Pluggable external dimension tables
  • Arrays and nested data types
  • Approximate query processing
  • Probabilistic data structures
  • Full support of IPv6
  • Features for web analytics
  • State-of-the-art algorithms
  • Detailed documentation
  • Clean documented code

Applications

  • Web and App analytics
  • Advertising networks and RTB
  • Telecommunications
  • E-commerce
  • Information security
  • Monitoring and telemetry
  • Business intelligence
  • Online games
  • Internet of Things

Download

System requirements: Linux, x86_64 with SSE 4.2.

Install packages for Ubuntu 16.04 (Xenial), Ubuntu 14.04 (Trusty), or Ubuntu 12.04 (Precise):
-sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4    # optional
-
-sudo mkdir -p /etc/apt/sources.list.d
-echo "deb http://repo.yandex.ru/clickhouse/trusty stable main" |
-    sudo tee /etc/apt/sources.list.d/clickhouse.list
-sudo apt-get update
-
-sudo apt-get install clickhouse-server-common clickhouse-client
-
-sudo service clickhouse-server start
-clickhouse-client
-
-

Read the documentation.

Or build ClickHouse from sources according to the instructions.
diff --git a/doc/reference/_static/custom.css b/doc/reference/_static/custom.css
index 233131237a564e642f327a2fabcbe8c39f3e0db5..0a5ea7382d639b09506448e8de1f8a9c9bc543b5 100644
--- a/doc/reference/_static/custom.css
+++ b/doc/reference/_static/custom.css
@@ -22,13 +22,22 @@ pre {
 input {
     display: block;
-    margin-bottom: 4px;
+    margin: 0 0 4px 0;
 }
 
 a.reference {
     border-bottom: none;
 }
 
+input[type="submit"] {
+    border: none!important;
+    background: #fc0;
+}
+
+#svg-flag {
+    border: 1px solid #eee;
+}
+
 @font-face {
     font-family: 'Yandex Sans Text Web';
     src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot);
diff --git a/doc/reference/_static/en.svg b/doc/reference/_static/en.svg
new file mode 100644
index 0000000000000000000000000000000000000000..b4b05f9e2899f34c224945646c52a7246d0122e3
--- /dev/null
+++ b/doc/reference/_static/en.svg
@@ -0,0 +1,7 @@
+<!-- 7 lines of SVG flag-icon markup -->
diff --git a/doc/reference/_static/ru.svg b/doc/reference/_static/ru.svg
new file mode 100644
index 0000000000000000000000000000000000000000..0ba0378c931b570aaf2df3a61b4141d0246464f6
--- /dev/null
+++ b/doc/reference/_static/ru.svg
@@ -0,0 +1,6 @@
+<!-- 6 lines of SVG flag-icon markup -->
diff --git a/doc/reference/en/conf.py b/doc/reference/en/conf.py
index a018fa8d4644fc28ceb303018829f56dc11cea14..b5c0b92b64fd841d67dce018a13fb38ca98daff3 100644
--- a/doc/reference/en/conf.py
+++ b/doc/reference/en/conf.py
@@ -125,7 +125,7 @@ html_theme_options = {
     'link': '#08f',
     'link_hover': 'red',
     'extra_nav_links': collections.OrderedDict([
-        ('Switch to Russian 🇷🇺 ', '/docs/ru/'),
+        ('Switch to Russian ', '/docs/ru/'),
         ('Single page documentation', '/docs/en/single/'),
         ('Website home', '/'),
         ('GitHub', 'https://github.com/yandex/ClickHouse'),
diff --git a/doc/reference/en/table_engines/resharding.rst b/doc/reference/en/table_engines/resharding.rst
index 6987d84445356963d8a742f5c32c2b40402ace97..5bb6b733d06243c8b37223861841de04d1597518 100644
--- a/doc/reference/en/table_engines/resharding.rst
+++ b/doc/reference/en/table_engines/resharding.rst
@@ -1,4 +1,4 @@
-Перешардирование
+Resharding
 ----------------
 
 .. code-block:: sql
diff --git a/doc/reference/ru/conf.py b/doc/reference/ru/conf.py
index 00e26c601979fbc256de7571bac3440a9064fc05..a84651463e8b4861480da70e1ade2a72bfdd3057 100644
--- a/doc/reference/ru/conf.py
+++ b/doc/reference/ru/conf.py
@@ -125,7 +125,7 @@ html_theme_options = {
     'link': '#08f',
     'link_hover': 'red',
     'extra_nav_links': collections.OrderedDict([
-        ('Switch to English 🇬🇧', '/docs/en/'),
+        ('Switch to English ', '/docs/en/'),
         ('Документация на одной странице', '/docs/ru/single/'),
         ('Главная страница сайта', '/'),
         ('GitHub', 'https://github.com/yandex/ClickHouse'),
diff --git a/doc/tutorial.html b/doc/tutorial.html
deleted file mode 100644
index 8d4d4e27e6ff886ae388d78b47f931d78f12f105..0000000000000000000000000000000000000000
--- a/doc/tutorial.html
+++ /dev/null
@@ -1,711 +0,0 @@
ClickHouse — quick start guide

Let's get started with a sample dataset from open sources. We will use USA civil flights data from 1987 to 2015. It's hard to call this sample Big Data (it contains 166 million rows, 63 GB of uncompressed data), but it allows us to get to work quickly. The dataset is available for download here. You can also download it from the original data source as described here.

First we will deploy ClickHouse to a single server. Further below we will also review the process of deploying to a cluster with support for sharding and replication.

On Ubuntu and Debian Linux, ClickHouse can be installed from packages. For other Linux distributions you can compile ClickHouse from sources and then install it.

The clickhouse-client package contains the clickhouse-client application, an interactive ClickHouse client. clickhouse-server-base contains the clickhouse-server binary, and clickhouse-server-common contains the config files for clickhouse-server.

Server config files are located in /etc/clickhouse-server/. Before getting to work, please note the path element in the config: it determines the location for data storage. Directly editing config.xml is not very handy, considering package updates; the recommended way is to override config elements in files in the config.d directory. You may also want to set up access rights at the start.
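A minimal sketch of such an override (the file name and the path value are hypothetical; server configs of that era used a <yandex> root element, and files in config.d are merged over config.xml):

<!-- /etc/clickhouse-server/config.d/data_path.xml (hypothetical file name) -->
<yandex>
    <path>/data/clickhouse/</path>
</yandex>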

clickhouse-server won't be launched automatically after package installation, and it won't be automatically restarted after updates either. Start the server with:

sudo service clickhouse-server start

The default location for server logs is /var/log/clickhouse-server/. The server is ready to handle client connections once the "Ready for connections" message has been logged.

Use clickhouse-client to connect to the server.

Tips for clickhouse-client

Interactive mode:

clickhouse-client
clickhouse-client --host=... --port=... --user=... --password=...

Enable multiline queries:

clickhouse-client -m
clickhouse-client --multiline

Run queries in batch mode:

clickhouse-client --query='SELECT 1'
echo 'SELECT 1' | clickhouse-client

Insert data from a file in a specified format:

clickhouse-client --query='INSERT INTO table VALUES' < data.txt
clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv

Create a table for the sample dataset

Create table query:
-$ clickhouse-client --multiline
-ClickHouse client version 0.0.53720.
-Connecting to localhost:9000.
-Connected to ClickHouse server version 0.0.53720.
-
-:) CREATE TABLE ontime
-(
-    Year UInt16,
-    Quarter UInt8,
-    Month UInt8,
-    DayofMonth UInt8,
-    DayOfWeek UInt8,
-    FlightDate Date,
-    UniqueCarrier FixedString(7),
-    AirlineID Int32,
-    Carrier FixedString(2),
-    TailNum String,
-    FlightNum String,
-    OriginAirportID Int32,
-    OriginAirportSeqID Int32,
-    OriginCityMarketID Int32,
-    Origin FixedString(5),
-    OriginCityName String,
-    OriginState FixedString(2),
-    OriginStateFips String,
-    OriginStateName String,
-    OriginWac Int32,
-    DestAirportID Int32,
-    DestAirportSeqID Int32,
-    DestCityMarketID Int32,
-    Dest FixedString(5),
-    DestCityName String,
-    DestState FixedString(2),
-    DestStateFips String,
-    DestStateName String,
-    DestWac Int32,
-    CRSDepTime Int32,
-    DepTime Int32,
-    DepDelay Int32,
-    DepDelayMinutes Int32,
-    DepDel15 Int32,
-    DepartureDelayGroups String,
-    DepTimeBlk String,
-    TaxiOut Int32,
-    WheelsOff Int32,
-    WheelsOn Int32,
-    TaxiIn Int32,
-    CRSArrTime Int32,
-    ArrTime Int32,
-    ArrDelay Int32,
-    ArrDelayMinutes Int32,
-    ArrDel15 Int32,
-    ArrivalDelayGroups Int32,
-    ArrTimeBlk String,
-    Cancelled UInt8,
-    CancellationCode FixedString(1),
-    Diverted UInt8,
-    CRSElapsedTime Int32,
-    ActualElapsedTime Int32,
-    AirTime Int32,
-    Flights Int32,
-    Distance Int32,
-    DistanceGroup UInt8,
-    CarrierDelay Int32,
-    WeatherDelay Int32,
-    NASDelay Int32,
-    SecurityDelay Int32,
-    LateAircraftDelay Int32,
-    FirstDepTime String,
-    TotalAddGTime String,
-    LongestAddGTime String,
-    DivAirportLandings String,
-    DivReachedDest String,
-    DivActualElapsedTime String,
-    DivArrDelay String,
-    DivDistance String,
-    Div1Airport String,
-    Div1AirportID Int32,
-    Div1AirportSeqID Int32,
-    Div1WheelsOn String,
-    Div1TotalGTime String,
-    Div1LongestGTime String,
-    Div1WheelsOff String,
-    Div1TailNum String,
-    Div2Airport String,
-    Div2AirportID Int32,
-    Div2AirportSeqID Int32,
-    Div2WheelsOn String,
-    Div2TotalGTime String,
-    Div2LongestGTime String,
-    Div2WheelsOff String,
-    Div2TailNum String,
-    Div3Airport String,
-    Div3AirportID Int32,
-    Div3AirportSeqID Int32,
-    Div3WheelsOn String,
-    Div3TotalGTime String,
-    Div3LongestGTime String,
-    Div3WheelsOff String,
-    Div3TailNum String,
-    Div4Airport String,
-    Div4AirportID Int32,
-    Div4AirportSeqID Int32,
-    Div4WheelsOn String,
-    Div4TotalGTime String,
-    Div4LongestGTime String,
-    Div4WheelsOff String,
-    Div4TailNum String,
-    Div5Airport String,
-    Div5AirportID Int32,
-    Div5AirportSeqID Int32,
-    Div5WheelsOn String,
-    Div5TotalGTime String,
-    Div5LongestGTime String,
-    Div5WheelsOff String,
-    Div5TailNum String
-)
-ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);

Now we have a table of the MergeTree type. MergeTree tables are recommended for production use. A table of this kind has a primary key by which the data is incrementally sorted, which allows fast execution of queries over ranges of the primary key.

Note: we store the impression logs of an ad network in ClickHouse. Each table entry looks like:
[Advertiser ID, Impression ID, attribute1, attribute2, ...]
Let's assume our aim is to provide a set of reports for each advertiser. A common and frequently demanded query would be counting impressions for a specific Advertiser ID. This means the table's primary key should start with Advertiser ID; ClickHouse then needs to read a smaller amount of data to answer a query for a given Advertiser ID, as in the sketch below.
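A sketch of that layout (all table and column names here are hypothetical, using the same old-style MergeTree parameters as the ontime table above):

CREATE TABLE impressions
(
    EventDate Date,
    AdvertiserID UInt32,
    ImpressionID UInt64
) ENGINE = MergeTree(EventDate, (AdvertiserID, EventDate), 8192);

-- Reads only the primary-key range of a single advertiser:
SELECT count() FROM impressions WHERE AdvertiserID = 42;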

Load data

xz -v -c -d < ontime.csv.xz | clickhouse-client --query="INSERT INTO ontime FORMAT CSV"

The ClickHouse INSERT query lets you load data in any supported format. Loading data requires only O(1) RAM, and an INSERT query can receive any volume of data as input. It's strongly recommended not to insert data in blocks that are too small. Note that inserting a block of up to max_insert_block_size rows (1,048,576 by default) is an atomic operation: the block is either inserted completely or not at all. If the connection breaks during the insert operation, you may not know whether the block was inserted successfully. To achieve exactly-once semantics, ClickHouse supports idempotency for replicated tables: you may retry inserting the same data block (possibly on a different replica), and it will be inserted only once. In this guide, though, we load data from localhost, so we need not worry about block generation and exactly-once semantics.
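For instance, the block size can be made explicit when loading from the command line (a sketch; it assumes settings such as max_insert_block_size can be passed as clickhouse-client options, and the value shown is just the default):

xz -v -c -d < ontime.csv.xz | clickhouse-client --max_insert_block_size=1048576 --query="INSERT INTO ontime FORMAT CSV"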

An INSERT query into a MergeTree table is non-blocking, and so is a SELECT query: you can run SELECT queries right after, or even during, an insert operation.

Our sample dataset is somewhat suboptimal, for two reasons.

The first is that the String data type is used in cases where an Enum or a numeric type would fit better.

When the set of possible values is fixed and known to be small (e.g. OS names, browser vendors, etc.), it's recommended to use Enums or numbers to improve performance, as in the sketch below. When the set of possible values is unbounded (search queries, URLs, etc.), just go ahead with String.
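For example, the CancellationCode column could be declared as an Enum instead of FixedString(1) (a sketch; the table name and the value list are illustrative, not the complete set of codes):

CREATE TABLE ontime_typed
(
    FlightDate Date,
    CancellationCode Enum8('' = 0, 'A' = 1, 'B' = 2, 'C' = 3, 'D' = 4)
) ENGINE = MergeTree(FlightDate, (FlightDate), 8192);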

The second is that the dataset contains redundant fields like Year, Quarter, Month, DayofMonth, DayOfWeek. In fact, a single FlightDate would be enough. Most likely they were added to improve performance for other DBMSes whose date and time handling functions may not be efficient.

ClickHouse functions for operating on dates and times are well optimized, so such redundancy is not required. In any case, a large number of columns is no reason to worry: ClickHouse is a column-oriented DBMS, so you can have as many fields as you need, and hundreds of columns in a table are fine.
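For example, the redundant calendar columns can be derived from FlightDate on the fly (a sketch over the ontime table defined above):

SELECT
    toYear(FlightDate) AS Year,
    toMonth(FlightDate) AS Month,
    toDayOfWeek(FlightDate) AS DayOfWeek,
    count() AS flights
FROM ontime
GROUP BY Year, Month, DayOfWeek
ORDER BY flights DESC
LIMIT 10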

Querying the sample dataset

Here are some example queries over the test data.

  • The most popular destinations in 2015:

SELECT
    OriginCityName,
    DestCityName,
    count(*) AS flights,
    bar(flights, 0, 20000, 40)
FROM ontime WHERE Year = 2015 GROUP BY OriginCityName, DestCityName ORDER BY flights DESC LIMIT 20

SELECT
    OriginCityName < DestCityName ? OriginCityName : DestCityName AS a,
    OriginCityName < DestCityName ? DestCityName : OriginCityName AS b,
    count(*) AS flights,
    bar(flights, 0, 40000, 40)
FROM ontime WHERE Year = 2015 GROUP BY a, b ORDER BY flights DESC LIMIT 20

  • The most popular cities of departure:

SELECT OriginCityName, count(*) AS flights
FROM ontime GROUP BY OriginCityName ORDER BY flights DESC LIMIT 20

  • Cities of departure that offer the maximum variety of destinations:

SELECT OriginCityName, uniq(Dest) AS u
FROM ontime GROUP BY OriginCityName ORDER BY u DESC LIMIT 20

  • Dependence of flight delays on the day of the week:

SELECT DayOfWeek, count() AS c, avg(DepDelay > 60) AS delays
FROM ontime GROUP BY DayOfWeek ORDER BY DayOfWeek

  • Cities of departure with the most frequent delays of 1 hour or longer:

SELECT OriginCityName, count() AS c, avg(DepDelay > 60) AS delays
FROM ontime
GROUP BY OriginCityName
HAVING c > 100000
ORDER BY delays DESC
LIMIT 20

  • Flights of maximum duration:

SELECT OriginCityName, DestCityName, count(*) AS flights, avg(AirTime) AS duration
FROM ontime
GROUP BY OriginCityName, DestCityName
ORDER BY duration DESC
LIMIT 20

  • Distribution of departure delays by airline:

SELECT Carrier, count() AS c, round(quantileTDigest(0.99)(DepDelay), 2) AS q
FROM ontime GROUP BY Carrier ORDER BY q DESC

  • Airlines that stopped operating flights:

SELECT Carrier, min(Year), max(Year), count()
FROM ontime GROUP BY Carrier HAVING max(Year) < 2015 ORDER BY count() DESC

  • The most trending destination cities in 2015:

SELECT
    DestCityName,
    sum(Year = 2014) AS c2014,
    sum(Year = 2015) AS c2015,
    c2015 / c2014 AS diff
FROM ontime
WHERE Year IN (2014, 2015)
GROUP BY DestCityName
HAVING c2014 > 10000 AND c2015 > 1000 AND diff > 1
ORDER BY diff DESC

  • Destination cities whose popularity depends most on the season:

SELECT
    DestCityName,
    any(total),
    avg(abs(monthly * 12 - total) / total) AS avg_month_diff
FROM
(
    SELECT DestCityName, count() AS total
    FROM ontime GROUP BY DestCityName HAVING total > 100000
)
ALL INNER JOIN
(
    SELECT DestCityName, Month, count() AS monthly
    FROM ontime GROUP BY DestCityName, Month HAVING monthly > 10000
)
USING DestCityName
GROUP BY DestCityName
ORDER BY avg_month_diff DESC
LIMIT 20

ClickHouse deployment to cluster

A ClickHouse cluster is a homogeneous cluster. Steps to set it up:

  1. Install ClickHouse server on all machines of the cluster
  2. Set up cluster configs in the configuration file
  3. Create local tables on each instance
  4. Create a Distributed table

A Distributed table is essentially a kind of "view" over the local tables of a ClickHouse cluster. A SELECT query against a Distributed table executes using the resources of all the cluster's shards. You may specify configs for multiple clusters and create multiple Distributed tables that provide views over different clusters.

Enable network access to ClickHouse:

<listen_host>::</listen_host>

Config for a cluster of three shards, each storing data on a single replica:
-<remote_servers>
-    <perftest_3shards_1replicas>
-        <shard>
-            <replica>
-                <host>example-perftest01j.yandex.ru</host>
-                <port>9000</port>
-            </replica>
-        </shard>
-        <shard>
-            <replica>
-                <host>example-perftest02j.yandex.ru</host>
-                <port>9000</port>
-            </replica>
-        </shard>
-        <shard>
-            <replica>
-                <host>example-perftest03j.yandex.ru</host>
-                <port>9000</port>
-            </replica>
-        </shard>
-    </perftest_3shards_1replicas>
-</remote_servers>

Creating a local table:

CREATE TABLE ontime_local (...) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);

Creating a Distributed table that provides a view into the local tables of the cluster:

CREATE TABLE ontime_all AS ontime_local
    ENGINE = Distributed(perftest_3shards_1replicas, default, ontime_local, rand());

You can create a Distributed table on all machines of the cluster; this allows running distributed queries on any machine. Besides the Distributed table, you can also use the *remote* table function, sketched below.
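A sketch of the remote table function, using one of the hosts from the config above (we assume the usual form remote('host[:port]', database, table)):

SELECT count() FROM remote('example-perftest01j.yandex.ru', default, ontime_local)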

Let's run INSERT SELECT into the Distributed table to spread the data across multiple servers:

INSERT INTO ontime_all SELECT * FROM ontime;
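Once the data is spread, queries against the Distributed table fan out to all shards (a sketch):

SELECT Carrier, count() AS flights FROM ontime_all GROUP BY Carrier ORDER BY flights DESC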

It's worth noticing that the approach given above is not suitable for sharding large tables; please use the built-in sharding feature instead.

As you would expect, heavy queries execute N times faster when launched on three servers instead of one.

See here.

You may have noticed that the quantile calculations differ slightly between runs. This happens because the implementation of the t-digest algorithm is non-deterministic: its result depends on the order in which the data is processed.
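If exact values are needed for comparison, a deterministic but more memory-hungry aggregate can be used instead (a sketch):

SELECT
    Carrier,
    round(quantileTDigest(0.99)(DepDelay), 2) AS approx_q99,
    quantileExact(0.99)(DepDelay) AS exact_q99
FROM ontime GROUP BY Carrier ORDER BY approx_q99 DESC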

In this case we used a cluster with three shards, each containing a single replica.

To provide resilience in a production environment, we recommend that each shard contain 2-3 replicas distributed between multiple datacenters. Note that ClickHouse supports an unlimited number of replicas.

Config for a cluster of one shard containing three replicas:
-<remote_servers>
-    ...
-    <perftest_1shards_3replicas>
-        <shard>
-            <replica>
-                <host>example-perftest01j.yandex.ru</host>
-                <port>9000</port>
-             </replica>
-             <replica>
-                <host>example-perftest02j.yandex.ru</host>
-                <port>9000</port>
-             </replica>
-             <replica>
-                <host>example-perftest03j.yandex.ru</host>
-                <port>9000</port>
-             </replica>
-        </shard>
-    </perftest_1shards_3replicas>
-</remote_servers>

To enable replication, ZooKeeper is required. ClickHouse takes care of data consistency on all replicas and automatically runs the restore procedure after a failure. It's recommended to deploy the ZooKeeper cluster on separate servers.

ZooKeeper is not a strict requirement: in some simple cases, you can duplicate the data by writing it to all replicas from your application code. This approach is not recommended, because in that case ClickHouse cannot guarantee data consistency on all replicas; that remains the responsibility of your application.

Set the ZooKeeper locations in the configuration file:
-<zookeeper-servers>
-    <node>
-        <host>zoo01.yandex.ru</host>
-        <port>2181</port>
-    </node>
-    <node>
-        <host>zoo02.yandex.ru</host>
-        <port>2181</port>
-    </node>
-    <node>
-        <host>zoo03.yandex.ru</host>
-        <port>2181</port>
-    </node>
-</zookeeper-servers>

We also need to set macros identifying the shard and replica; they will be used at table creation:
-<macros>
-    <shard>01</shard>
-    <replica>01</replica>
-</macros>

If there are no replicas at the moment the replicated table is created, a new first replica is instantiated. If there are already live replicas, the new replica clones the data from the existing ones. You can create all the replicated tables first and then insert data into them, or create some replicas first and add the others during or after data insertion.
-CREATE TABLE ontime_replica (...)
-ENGINE = ReplicatedMergeTree(
-    '/clickhouse_perftest/tables/{shard}/ontime',
-    '{replica}',
-    FlightDate,
-    (Year, FlightDate),
-    8192);

Here we use the ReplicatedMergeTree table type. In the parameters, we specify the ZooKeeper path containing the shard and replica identifiers.

INSERT INTO ontime_replica SELECT * FROM ontime;

Replication operates in multi-master mode. Data can be loaded into any replica, and it is synced with the other instances automatically. Replication is asynchronous, so at a given moment not all replicas may contain the recently inserted data. At least one replica must be up to allow data insertion; the others will sync up the data and repair consistency once they become active again. Note that this scheme allows for the possibility of losing just-appended data.
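Replica state can be inspected through the system.replicas table (a sketch; only a few of its columns are shown):

SELECT database, table, is_leader, total_replicas, active_replicas
FROM system.replicas
WHERE table = 'ontime_replica'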

Feedback

Ask questions on Stack Overflow, use the Google Group for discussion, or write to the developers directly at clickhouse-feedback@yandex-team.com.

The software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/release b/release
index fa193f3a9115c74ba9340ffa34b05eb455d96899..d85357fc185dd0b619c185c6c0ff39d6dd05028c 100755
--- a/release
+++ b/release
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 cd $CURDIR
diff --git a/release_lib.sh b/release_lib.sh
index 65d22cfa713fac692f91a093700bcc55f1464039..56ef68bac10fc857a1047e3578bbe0e45595b292 100644
--- a/release_lib.sh
+++ b/release_lib.sh
@@ -12,14 +12,8 @@ function make_control {
 # set environment variables REVISION, AUTHOR
 function gen_revision_author {
     REVISION=$(get_revision)
-
-    if [ -z $VERSION_PREFIX ] ; then
-        VERSION_PREFIX="v1.1."
-    fi
-
-    if [ -z $VERSION_POSTFIX ] ; then
-        VERSION_POSTFIX="-testing"
-    fi
+    VERSION_PREFIX="${VERSION_PREFIX:-v1.1.}"
+    VERSION_POSTFIX="${VERSION_POSTFIX:--testing}"
 
     if [[ $STANDALONE != 'yes' ]]; then
diff --git a/website/README.md b/website/README.md
index 45e7dac900ece2f2046319a144f37df91d2e0e95..b0034a22979cd533cf88e07a5fa623f404c67bb8 100755
--- a/website/README.md
+++ b/website/README.md
@@ -1,4 +1,6 @@
 ClickHouse website quickstart:
-1. If npm is not installed: `apt-get install npm` for Debian/Ubuntu, `brew install npm` for Mac OS or download and install manually https://nodejs.org/en/download/
-2. Run setup_gulp.sh once to install prerequisites via npm
-3. Use `gulp build` to minify website to "public" folder or just `gulp` to run local webserver with livereload serving it
+1. Make sure you have `npm`, `docker` and `python` installed and available in your `$PATH`.
+2. Run `setup_gulp.sh` once to install build prerequisites via npm.
+3. Use `gulp build` to minify website to "public" subfolder or just `gulp` to run local webserver with livereload serving it (note: livereload browser extension is required to make it actually reload pages on edits automatically).
+4. There's a Dockerfile that can be used to build and run the ClickHouse website inside docker.
+5. Deployment to https://clickhouse.yandex is managed by `release.sh`, but it is only usable from inside the Yandex private network.
diff --git a/website/gulpfile.js b/website/gulpfile.js
index 427b318aa0fb4657546abf5ecc1b5c96db0924bf..096f7388a203dd4d157ec47e8cea3a978d8abb88 100644
--- a/website/gulpfile.js
+++ b/website/gulpfile.js
@@ -49,7 +49,13 @@ gulp.task('reference', [], function () {
         .pipe(gulp.dest(outputDir))
 });
 
-gulp.task('docs', [], function () {
+gulp.task('docstxt', [], function () {
+    run('cd ' + docsDir + '; make');
+    return gulp.src(paths.docs)
+        .pipe(gulp.dest(outputDir + '/../docs'))
+});
+
+gulp.task('docs', ['docstxt'], function () {
     run('cd ' + docsDir + '; make');
     return gulp.src(paths.docs)
         .pipe(gulp.dest(outputDir + '/../docs'))
diff --git a/website/index.html b/website/index.html
index c3d0073ad7a63ee151ae9d3b23e2682ca1e4b351..f4315c10d0130d189353487e0498c2bee275301e 100644
--- a/website/index.html
+++ b/website/index.html
@@ -847,6 +847,11 @@ clickhouse-client
         window.history.replaceState('', document.title, window.location.href.replace(location.hash, '') + this.hash);
     });
 
+    var hostParts = window.location.host.split('.');
+    if (hostParts.length > 2 && hostParts[0] != 'test') {
+        window.location.host = hostParts[0] + '.' + hostParts[1];
+    }
+
     var available_distributives = ['xenial', 'trusty', 'precise'];
     available_distributives.forEach(function (name) {
         $('#ubuntu_' + name).on('click', function () {
diff --git a/website/reference_en.html b/website/reference_en.html
index 2cbbdbb2d677fdc1ad0450ac0e94597c9db486e7..028e1d3299cc93930c97f88adb05cd33d9925c90 100644
--- a/website/reference_en.html
+++ b/website/reference_en.html
@@ -50,15 +50,6 @@ function getParams() {