Commit 560981ac authored by: F fjy

a ton of fixes to docs

Parent cf4711d9
......@@ -23,6 +23,7 @@ import com.google.common.base.Supplier;
import com.google.common.base.Throwables;
import com.google.common.collect.Maps;
import com.google.inject.Inject;
import com.metamx.common.ISE;
import com.metamx.common.concurrent.ScheduledExecutors;
import com.metamx.common.lifecycle.LifecycleStart;
import com.metamx.common.lifecycle.LifecycleStop;
......@@ -38,6 +39,7 @@ import org.skife.jdbi.v2.tweak.ResultSetMapper;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentMap;
......@@ -76,7 +78,7 @@ public class ConfigManager
final String configTable = dbTables.get().getConfigTable();
this.selectStatement = String.format("SELECT payload FROM %s WHERE name = :name", configTable);
insertStatement = String.format(
this.insertStatement = String.format(
"INSERT INTO %s (name, payload) VALUES (:name, :payload) ON DUPLICATE KEY UPDATE payload = :payload",
configTable
);
......@@ -186,19 +188,29 @@ public class ConfigManager
@Override
public byte[] withHandle(Handle handle) throws Exception
{
return handle.createQuery(selectStatement)
.bind("name", key)
.map(
new ResultSetMapper<byte[]>()
{
@Override
public byte[] map(int index, ResultSet r, StatementContext ctx) throws SQLException
{
return r.getBytes("payload");
}
}
)
.first();
List<byte[]> matched = handle.createQuery(selectStatement)
.bind("name", key)
.map(
new ResultSetMapper<byte[]>()
{
@Override
public byte[] map(int index, ResultSet r, StatementContext ctx)
throws SQLException
{
return r.getBytes("payload");
}
}
).list();
if (matched.isEmpty()) {
return null;
}
if (matched.size() > 1) {
throw new ISE("Error! More than one matching entry[%d] found for [%s]?!", matched.size(), key);
}
return matched.get(0);
}
}
);
......
......@@ -11,9 +11,6 @@
<a href="mailto:info@druid.io">info@druid.io</a>
</address>
<address>
<strong>Metamarkets</strong>
625 2nd Street, Suite #230<br>
San Francisco, CA 94017<br>
<div class="soc">
<a href="https://twitter.com/druidio"></a>
<a href="https://github.com/metamx/druid" class="github"></a>
......@@ -25,7 +22,7 @@
<li><a href="/"><strong>DRUID</strong></a></li>
<li><a href="/druid.html">What is Druid?</a></li>
<li><a href="/downloads.html">Downloads</a></li>
<li><a target="_blank" href="https://github.com/metamx/druid/wiki">Documentation</a></li>
<li><a target="_blank" href="Home.html">Documentation</a></li>
</ul>
<ul class="col-md-4 list-unstyled">
<li><a href="/community.html"><strong>SUPPORT</strong></a></li>
......
......@@ -3,7 +3,7 @@ layout: doc_page
---
# Booting a Single Node Cluster #
[Loading Your Data](Loading-Your-Data.html) and [Querying Your Data](Querying-Your-Data.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.6.0-bin.tar.gz).
[Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-2.html) and [All About Queries](Tutorial%3A-All-About-Queries.html) contain recipes to boot a small Druid cluster on localhost. Here we will boot a small cluster on EC2. You can check out the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.6.0-bin.tar.gz).
The [ec2 run script](https://github.com/metamx/druid/blob/master/examples/bin/run_ec2.sh), run_ec2.sh, is located at 'examples/bin' if you have checked out the code, or at the root of the project if you've downloaded a tarball. The scripts rely on the [Amazon EC2 API Tools](http://aws.amazon.com/developertools/351), and you will need to set three environment variables:
......
---
layout: doc_page
---
A Druid cluster consists of various node types that need to be set up depending on your use case. See our [[Design]] docs for a description of the different node types.
h2. Setup Scripts
One of our community members, "housejester":https://github.com/housejester/, contributed some scripts to help with setting up a cluster. Checkout the "github":https://github.com/housejester/druid-test-harness and "wiki":https://github.com/housejester/druid-test-harness/wiki/Druid-Test-Harness.
h2. Minimum Physical Layout: Absolute Minimum
As a special case, the absolute minimum setup is one of the standalone examples for realtime ingestion and querying (see [[Examples]]), which can easily run on one machine with one core and 1GB RAM. This layout can be set up to try some basic queries with Druid.
h2. Minimum Physical Layout: Experimental Testing with 4GB of RAM
This layout can be used to load some data from deep storage onto a Druid compute node for the first time. A minimal physical layout for a 1 or 2 core machine with 4GB of RAM is:
# node1: [[Master]] + metadata service + zookeeper + [[Compute]]
# transient nodes: indexer
This setup is only reasonable to prove that a configuration works. It would not be worthwhile to use this layout for performance measurement.
h2. Comfortable Physical Layout: Pilot Project with Multiple Machines
_The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work._
A minimal physical layout, not constrained by cores, that demonstrates parallel querying and realtime ingestion, using AWS-EC2 "small"/m1.small (one core, with 1.7GB of RAM) or larger, is:
# node1: [[Master]] (m1.small)
# node2: metadata service (m1.small)
# node3: zookeeper (m1.small)
# node4: [[Broker]] (m1.small or m1.medium or m1.large)
# node5: [[Compute]] (m1.small or m1.medium or m1.large)
# node6: [[Compute]] (m1.small or m1.medium or m1.large)
# node7: [[Realtime]] (m1.small or m1.medium or m1.large)
# transient nodes: indexer
This layout naturally lends itself to adding more RAM and cores to Compute nodes, and to adding many more Compute nodes. Depending on the actual load, the Master, metadata server, and Zookeeper might need to use larger machines.
h2. High Availability Physical Layout
_The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work._
An HA layout allows full rolling restarts and heavy volume:
# node1: [[Master]] (m1.small or m1.medium or m1.large)
# node2: [[Master]] (m1.small or m1.medium or m1.large) (backup)
# node3: metadata service (c1.medium or m1.large)
# node4: metadata service (c1.medium or m1.large) (backup)
# node5: zookeeper (c1.medium)
# node6: zookeeper (c1.medium)
# node7: zookeeper (c1.medium)
# node8: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# node9: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) (backup)
# node10: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# node11: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# node12: [[Realtime]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# transient nodes: indexer
h2. Sizing for Cores and RAM
The Compute and Broker nodes will use as many cores as are available, depending on usage, so it is best to keep these on dedicated machines. The upper limit of effectively utilized cores is not well characterized yet and will depend on the types of queries, query load, and the schema. Compute daemons should have a heap size of at least 1GB per core for normal usage, but could be squeezed into a smaller heap for testing. Since in-memory caching is essential for good performance, even more RAM is better. Broker nodes will use RAM for caching, so they do more than just route queries.
The effective utilization of cores by Zookeeper, MySQL, and Master nodes is likely to be between 1 and 2 for each process/daemon, so these could potentially share a machine with lots of cores. These daemons work with a heap size between 500MB and 1GB.
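As a rough illustration of the 1GB-per-core guideline, a Compute node on an 8-core machine might be started with a heap along the following lines; the sizes here are only an example and should be tuned for your hardware and workload:
<pre>
<code>
java -server -Xms8g -Xmx8g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp compute/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.ComputeMain
</code>
</pre>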
h2. Storage
Indexed segments should be kept in a permanent store accessible by all nodes, such as AWS S3, HDFS, or equivalent. Currently Druid supports S3, but this will be extended soon.
Local disk ("ephemeral" on AWS EC2) for caching is recommended over network-mounted storage (for example AWS EBS, Elastic Block Store) in order to avoid network delays during times of heavy usage. If your data center is suitably provisioned for networked storage, perhaps with separate LAN/NICs just for storage, then mounted storage might work fine.
h2. Setup
Setting up a cluster is essentially just firing up all of the nodes you want with the proper [[configuration]]. One thing to be aware of is that there are a few properties in the configuration that potentially need to be set individually for each process:
<pre>
<code>
druid.server.type=historical|realtime
druid.host=someHostOrIPaddrWithPort
druid.port=8080
</code>
</pre>
@druid.server.type@ should be set to "historical" for your compute nodes and "realtime" for the realtime nodes. The master will only assign segments to a "historical" node and the broker has some intelligence around its ability to cache results when talking to a realtime node. This does not need to be set for the master or the broker.
@druid.host@ should be set to the hostname and port that can be used to talk to the given server process. Basically, someone should be able to send a request to http://${druid.host}/ and actually talk to the process.
@druid.port@ should be set to the port that the server should listen on. In the vast majority of cases, this port should be the same as what is on @druid.host@.
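For example, a single Compute process might use the following values; the hostname shown is just a placeholder for whatever name or IP your machines actually use:
<pre>
<code>
druid.server.type=historical
druid.host=druid-compute-01:8080
druid.port=8080
</code>
</pre>
A Realtime process would differ only in setting @druid.server.type=realtime@ and in whatever host and port it listens on.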
h2. Build/Run
The simplest way to build and run from the repository is to run @mvn package@ from the base directory and then take @druid-services/target/druid-services-*-selfcontained.jar@ and push that around to your machines; the jar does not need to be expanded, and since it contains the main() methods for each kind of service, it is *not* invoked with java -jar. It can be run from a normal java command-line by just including it on the classpath and then giving it the main class that you want to run. For example, one instance of the Compute node/service can be started like this:
<pre>
<code>
java -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp compute/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.ComputeMain
</code>
</pre>
The following table shows the possible services and fully qualified class for main().
|_. service |_. main class |
| [[ Realtime ]] | com.metamx.druid.realtime.RealtimeMain |
| [[ Master ]] | com.metamx.druid.http.MasterMain |
| [[ Broker ]] | com.metamx.druid.http.BrokerMain |
| [[ Compute ]] | com.metamx.druid.http.ComputeMain |
\ No newline at end of file
......@@ -2,9 +2,8 @@
layout: doc_page
---
Druid is an open-source analytics datastore designed for realtime, exploratory, queries on large-scale data sets (100’s of Billions entries, 100’s TB data). Druid provides for cost effective, always-on, realtime data ingestion and arbitrary data exploration.
Druid is an open-source analytics data store designed for real-time, exploratory queries on large-scale data sets (hundreds of billions of entries, hundreds of TB of data). Druid provides cost-effective, always-on, realtime data ingestion and arbitrary data exploration.
- Check out some [Examples](Examples.html)
- Try out Druid with our Getting Started [Tutorial](./Tutorial%3A-A-First-Look-at-Druid.html)
- Learn more by reading the [White Paper](http://static.druid.io/docs/druid.pdf)
......
......@@ -6,7 +6,7 @@ The indexing service is a highly-available, distributed service that runs indexi
The indexing service is composed of three main components: a peon component that can run a single task, a middle manager component that manages peons, and an overlord component that manages task distribution to middle managers.
Overlords and middle managers may run on the same node or across multiple nodes while middle managers and peons always run on the same node.
Most Basic Getting Started Configuration
Quick Start
----------------------------------------
Run:
......@@ -149,7 +149,7 @@ http://<COORDINATOR_IP>:<port>/druid/indexer/v1/worker/setup
A sample worker setup spec is shown below:
```
```json
{
"minVersion":"some_version",
"minNumWorkers":"0",
......
---
layout: doc_page
---
# Setup #
Before we start querying Druid, we're going to finish setting up a complete cluster on localhost. In [Loading Your Data](Loading-Your-Data.html) we set up a [Realtime](Realtime.html), [Historical](Historical.html) and [Coordinator](Coordinator.html) node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'.
## Booting a Broker Node ##
1. Set up a config file at config/broker/runtime.properties that looks like this:
```
druid.host=localhost
druid.service=broker
druid.port=8080
druid.zk.service.host=localhost
```
2. Run the broker node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```
With the Broker node and the other Druid node types up and running, you have a fully functional Druid Cluster and are ready to query your data!
# Querying Your Data #
Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [Loading Your Data](Loading-Your-Data.html). Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html).
## Querying Different Nodes ##
Druid is a shared-nothing system, and there are three ways to query it: against the [Realtime](Realtime.html), [Historical](Historical.html) or [Broker](Broker.html) node. Querying a Realtime node returns only realtime data; querying a Historical node returns only historical segments. Querying the broker may query both realtime and historical segments and compose an overall result for the query. This is the normal mode of operation for queries in Druid.
### Construct a Query ###
For constructing this query, see: Querying against the realtime.spec
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"dimensions": [],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
### Querying the Realtime Node ###
Run our query against port 8080:
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
```
See our result:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 5, "wp" : 15000.0, "rows" : 5 }
} ]
```
### Querying the Historical node ###
Run the query against port 8082:
```bash
curl -X POST "http://localhost:8082/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
```
And get (similar to):
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 27, "wp" : 77000.0, "rows" : 9 }
} ]
```
### Querying the Broker ###
Run the query against port 8083:
```bash
curl -X POST "http://localhost:8083/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
```
And get:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 5, "wp" : 15000.0, "rows" : 5 }
} ]
```
Now that we know which nodes can be queried (although you should usually use the broker node), let's learn how to find out what queries are available.
## Examining the realtime.spec ##
How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our [Realtime](Realtime.html) node's realtime.spec file:
```json
[
{
"schema": {
"dataSource": "druidtest",
"aggregators": [
{
"type": "count",
"name": "impressions"
},
{
"type": "doubleSum",
"name": "wp",
"fieldName": "wp"
}
],
"indexGranularity": "minute",
"shardSpec": {
"type": "none"
}
},
"config": {
"maxRowsInMemory": 500000,
"intermediatePersistPeriod": "PT10m"
},
"firehose": {
"type": "kafka-0.7.2",
"consumerProps": {
"zk.connect": "localhost:2181",
"zk.connectiontimeout.ms": "15000",
"zk.sessiontimeout.ms": "15000",
"zk.synctime.ms": "5000",
"groupid": "topic-pixel-local",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "druidtest",
"parser": {
"timestampSpec": {
"column": "utcdt",
"format": "iso"
},
"data": {
"format": "json"
},
"dimensionExclusions": [
"wp"
]
}
},
"plumber": {
"type": "realtime",
"windowPeriod": "PT10m",
"segmentGranularity": "hour",
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
"rejectionPolicy": {
"type": "messageTime"
}
}
}
]
```
### dataSource ###
```json
"dataSource":"druidtest"
```
Our dataSource tells us the name of the relation/table, or 'source of data', to query in both our realtime.spec and query.body!
### aggregations ###
Note the [Aggregations](Aggregations.html) in our query:
```json
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
```
This matches up to the aggregators in the schema of our realtime.spec!
```json
"aggregators":[ {"type":"count", "name":"impressions"},
{"type":"doubleSum","name":"wp","fieldName":"wp"}],
```
### dimensions ###
Let's look back at our actual records (from [Loading Your Data](Loading-Your-Data.html)):
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
```
Note that, besides our primary metric wp, we have two dimensions in our data: 'gender' and 'age'. We can specify these in our query! Below, we have added the dimension 'age'.
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"dimensions": ["age"],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us grouped data in return!
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "100", "wp" : 1000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "20", "wp" : 3000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "30", "wp" : 4000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "40", "wp" : 5000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "50", "wp" : 2000.0, "rows" : 1 }
} ]
```
### filtering ###
Now that we've observed our dimensions, we can also filter:
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"filter": { "type": "selector", "dimension": "gender", "value": "male" },
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us metrics for only the rows where gender is 'male':
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 3, "wp" : 9000.0, "rows" : 3 }
} ]
```
Check out [Filters](Filters.html) for more information.
## Learn More ##
You can learn more about querying at [Querying](Querying.html)! Now check out [Booting a production cluster](Booting-a-production-cluster.html)!
......@@ -12,7 +12,7 @@ Segment Creation Tasks
The Index Task is a simpler variation of the Index Hadoop task that is designed to be used for smaller data sets. The task executes within the indexing service and does not require an external Hadoop setup to use. The grammar of the index task is as follows:
```
```json
{
"type" : "index",
"dataSource" : "example",
......@@ -50,7 +50,7 @@ The Index Task is a simpler variation of the Index Hadoop task that is designed
|--------|-----------|---------|
|type|The task type; this should always be "index".|yes|
|id|The task ID.|no|
|granularitySpec|See [granularitySpec](Tasks.html#Granularity-Spec)|yes|
|granularitySpec|See [granularitySpec](Tasks.html)|yes|
|spatialDimensions|Dimensions to build spatial indexes over. See [Spatial-Indexing](Spatial-Indexing.html)|no|
|aggregators|The metrics to aggregate in the data set. For more info, see [Aggregations](Aggregations.html)|yes|
|indexGranularity|The rollup granularity for timestamps.|no|
......@@ -78,10 +78,10 @@ The Hadoop Index Task is used to index larger data sets that require the paralle
The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. The grammar for the real-time task is as follows:
```
```json
{
"type" : "index_realtime",
"id": "example,
"id": "example",
"resource": {
"availabilityGroup" : "someGroup",
"requiredCapacity" : 1
......@@ -154,10 +154,10 @@ A JSON object used for high availability purposes. Not required.
|requiredCapacity|Integer|How much middle manager capacity this task will take.|yes|
Schema:
See [Schema](Realtime.html#Schema).
See [Schema](Realtime.html).
Fire Department Config:
See [Config](Realtime.html#Config).
See [Config](Realtime.html).
Firehose:
See [Firehose](Firehose.html).
......@@ -178,7 +178,7 @@ Segment Merging Tasks
Append tasks append a list of segments together into a single segment (one after the other). The grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
......@@ -190,7 +190,7 @@ Append tasks append a list of segments together into a single segment (one after
Merge tasks merge a list of segments together. Any common timestamps are merged. The grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
......@@ -205,7 +205,7 @@ Segment Destroying Tasks
Delete tasks create empty segments with no data. The grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
......@@ -217,7 +217,7 @@ Delete tasks create empty segments with no data. The grammar is:
Kill tasks delete all information about a segment and remove it from deep storage. Killable segments must be disabled (used==0) in the Druid segment table. The available grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
......@@ -232,7 +232,7 @@ Misc. Tasks
These tasks convert segments from an existing older index version to the latest index version. The available grammar is:
```
```json
{
"id": <task_id>,
"groupId" : <task_group_id>,
......@@ -246,7 +246,7 @@ These tasks convert segments from an existing older index version to the latest
These tasks start, sleep for a time and are used only for testing. The available grammar is:
```
```json
{
"id": <optional_task_id>,
"interval" : <optional_segment_interval>,
......
......@@ -43,12 +43,11 @@ These metrics track the number of characters added, deleted, and changed.
Setting Up
----------
There are two ways to setup Druid: download a tarball, or [Build From Source](Build From Source.html). You only need to do one of these.
There are two ways to set up Druid: download a tarball, or [Build From Source](Build-from-source.html). You only need to do one of these.
### Download a Tarball
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz)
Download this file to a directory of your choosing.
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz). Download this file to a directory of your choosing.
You can extract the awesomeness within by issuing:
......@@ -98,7 +97,7 @@ Okay, things are about to get real-time. To query the real-time node you've spun
./run_example_client.sh
```
Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this:
Select "wikipedia" once again. This script issues [GroupByQueries](GroupByQuery.html) to the data we've been ingesting. The query looks like this:
```json
{
......@@ -108,7 +107,7 @@ Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.ht
"dimensions":[ "page" ],
"aggregations":[
{"type":"count", "name":"rows"},
{"type":"longSum", "fieldName":"edit_count", "name":"count"}
{"type":"longSum", "fieldName":"count", "name":"edit_count"}
],
"filter":{ "type":"selector", "dimension":"namespace", "value":"article" },
"intervals":[ "2013-06-01T00:00/2020-01-01T00" ]
......@@ -151,7 +150,7 @@ time_boundary_query.body
Druid queries are JSON blobs which are relatively painless to create programmatically, but an absolute pain to write by hand. So anyway, we are going to create a Druid query by hand. Add the following to the file you just created:
```
```json
{
"queryType": "timeBoundary",
"dataSource": "wikipedia"
......@@ -186,7 +185,7 @@ timeseries_query.body
We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file:
```
```json
{
"queryType": "timeseries",
"dataSource": "wikipedia",
......@@ -221,7 +220,7 @@ Right now all the results you are getting back are being aggregated into a singl
If you loudly exclaimed "we can change granularity to minute", you are absolutely correct! We can specify different granularities to bucket our results, like so:
```
```json
{
"queryType": "timeseries",
"dataSource": "wikipedia",
......@@ -267,7 +266,7 @@ group_by_query.body
and put the following in there:
```
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
......@@ -321,13 +320,13 @@ Feel free to tweak other query parameters to answer other questions you may have
Next Steps
----------
What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
Want to know even more about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial%3A-The-Druid-Cluster.html)
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html).
Additional Information
----------------------
This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki.
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
---
layout: doc_page
---
Hello! This tutorial is meant to provide a more in-depth look into Druid queries. The tutorial is somewhat incomplete right now but we hope to add more content to it in the near future.
Setup
-----
Before we start digging into how to query Druid, make sure you've gone through the other tutorials and are comfortable with spinning up a local cluster and loading data into Druid.
#### Booting a Druid Cluster
Let's start up a simple Druid cluster so we can query all the things.
To start a Coordinator node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```
To start a Historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
To start a Broker node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```
Querying Your Data
------------------
Make sure you've completed [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html) so we have some data to query. Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html).
#### Construct a Query
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"dimensions": [],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
#### Query That Data
Run the query against your broker:
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'Content-type: application/json' -d @query.body
```
And get:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 1545.0,
"edit_count" : 5,
"rows" : 5
}
} ]
```
This result tells us that our query has 5 edits, and we have 5 rows of data as well. In those 5 edits, we have 1545 characters added.
#### What can I query for?
How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our ingestion schema. There are a few particular fields we care about in the ingestion schema. All of these fields should be present in both the real-time ingestion schema and the batch ingestion schema.
Datasource:
```json
"dataSource":"wikipedia"
```
Our dataSource tells us the name of the relation/table, or 'source of data'. What we decide to name our data source must match the data source we are going to be querying.
Granularity:
```json
"indexGranularity": "none",
```
Druid will roll up data at ingestion time unless the index/rollup granularity is specified as "none". Your query granularity cannot be lower than your index granularity.
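For example (the value here is hypothetical and not part of this tutorial's schema), consider an ingestion schema that rolls data up to minutes:

```json
"indexGranularity": "minute"
```

Such data could be queried with a query granularity of "minute", "hour", "day", or "all", but asking for "second" buckets would not return anything finer than the one-minute rollups.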
Aggregators:
```json
"aggregators" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}]
```
The [Aggregations](Aggregations.html) specified at ingestion time correspond directly to the metrics that can be queried.
Dimensions:
```json
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
```
These specify the dimensions that we can filter our data on. If we added a dimension to our groupBy query, we get:
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"dimensions": ["namespace"],
"aggregations": [
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us data grouped over the namespace dimension in return!
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 180.0,
"edit_count" : 2,
"namespace" : "article"
}
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 1365.0,
"edit_count" : 3,
"namespace" : "wikipedia"
}
} ]
```
Additionally, we can also filter our query to narrow down our metric values:
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"filter": { "type": "selector", "dimension": "namespace", "value": "article" },
"aggregations": [
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us metrics about only those edits where the namespace is 'article':
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 180.0,
"edit_count" : 2
}
} ]
```
Check out [Filters](Filters.html) for more information.
## Learn More ##
You can learn more about querying at [Querying](Querying.html)! If you are ready to evaluate Druid more in depth, check out [Booting a production cluster](Booting-a-production-cluster.html)!
---
layout: doc_page
---
In our last [tutorial](Tutorial:-The-Druid-Cluster.html), we setup a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments.
In our last [tutorial](Tutorial%3A-The-Druid-Cluster.html), we set up a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments.
In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. The output of most tasks is segments.
......@@ -50,12 +50,12 @@ examples/indexing/wikipedia_data.json
Open the file and make sure the following events exist:
```
```json
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
There are five data points spread across the day of 2013-08-31. Talk about big data, right? Thankfully, we don't need a ton of data to introduce how batch ingestion works.
......@@ -71,12 +71,14 @@ java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/ov
```
The overlord configurations should already exist in:
```
config/overlord/runtime.properties
```
The configurations for the overlord node are as follows:
```
```bash
druid.host=localhost
druid.port=8087
druid.service=overlord
......@@ -96,8 +98,9 @@ druid.indexer.fork.property.druid.computation.buffer.size=268435456
If you are interested in reading more about these configurations, see [here](Indexing-Service.html).
When the overlord node is ready for tasks, you should see a message like the following:
```
013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work...
```bash
2013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work...
```
#### Starting Other Nodes
......@@ -111,6 +114,7 @@ java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/
```
Historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
......@@ -130,7 +134,7 @@ examples/indexing/index_task.json
Open up the file to see the following:
```
```json
{
"type" : "index",
"dataSource" : "wikipedia",
......@@ -141,7 +145,7 @@ Open up the file to see the following:
},
"aggregators" : [{
"type" : "count",
"name" : "edit_count"
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
......@@ -176,21 +180,21 @@ Okay, so what is happening here? The "type" field indicates the type of task we
Let's send our task to the indexing service now:
```
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
```
Issuing the request should return a task ID like so:
```
fjy$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
```bash
$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
{"task":"index_wikipedia_2013-10-09T21:30:32.802Z"}
fjy$
$
```
In your indexing service logs, you should see the following:
````
```bash
2013-10-09 21:41:41,150 INFO [qtp300448720-21] io.druid.indexing.overlord.HeapMemoryTaskStorage - Inserting task index_wikipedia_2013-10-09T21:41:41.147Z with status: TaskStatus{id=index_wikipedia_2013-10-09T21:41:41.147Z, status=RUNNING, duration=-1}
2013-10-09 21:41:41,151 INFO [qtp300448720-21] io.druid.indexing.overlord.TaskLockbox - Created new TaskLockPosse: TaskLockPosse{taskLock=TaskLock{groupId=index_wikipedia_2013-10-09T21:41:41.147Z, dataSource=wikipedia, interval=2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z, version=2013-10-09T21:41:41.151Z}, taskIds=[]}
...
......@@ -201,7 +205,7 @@ In your indexing service logs, you should see the following:
After a few seconds, the task should complete and you should see in the indexing service logs:
```
```bash
2013-10-09 21:41:45,765 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Received SUCCESS status for task: IndexGeneratorTask{id=index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0, type=index_generator, dataSource=wikipedia, interval=Optional.of(2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z)}
```
......@@ -209,7 +213,7 @@ Congratulations! The segment has completed building. Once a segment is built, a
You should see the following logs on the coordinator:
```
```bash
2013-10-09 21:41:54,368 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - [_default_tier] : Assigned 1 segments among 1 servers
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Load Queues:
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Server[localhost:8081, historical, _default_tier] has 1 left to load, 0 left to drop, 4,477 bytes queued, 4,477 bytes served.
......@@ -217,7 +221,7 @@ You should see the following logs on the coordinator:
These logs indicate that the coordinator has assigned our new segment to the historical node to download and serve. If you look at the historical node logs, you should see:
```
```bash
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.server.coordination.ZkCoordinator - Loading segment wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.segment.loading.LocalDataSegmentPuller - Unzipping local file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
2013-10-09 21:41:54,370 INFO [ZkCoordinator-0] io.druid.utils.CompressionUtils - Unzipping file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
......@@ -228,7 +232,7 @@ Once the segment is announced the segment is queryable. Now you should be able t
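A minimal query body for this, mirroring the timeBoundary query from the first tutorial, would look something like:

```json
{
    "queryType": "timeBoundary",
    "dataSource": "wikipedia"
}
```

POST it to a queryable node (for example, the historical node serving the segment) in the same way as the queries in the earlier tutorials.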
Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield:
```
```json
[ {
"timestamp" : "2013-08-31T01:02:33.000Z",
"result" : {
......@@ -241,9 +245,9 @@ Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield:
Next Steps
----------
This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets.
This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial%3A-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets.
Additional Information
----------------------
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
---
layout: doc_page
---
Welcome back! In our first [tutorial](Tutorial:-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data?
Welcome back! In our first [tutorial](Tutorial%3A-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of the Druid cluster look like? How does Druid load available static data?
This tutorial will hopefully answer these questions!
In this tutorial, we will set up other types of Druid nodes as well as external dependencies for a fully functional Druid cluster. The architecture of Druid is very much like the [Megazord](http://www.youtube.com/watch?v=7mQuHh1X4H4) from the popular 90s show Mighty Morphin' Power Rangers. Each Druid node has a specific purpose and the nodes come together to form a fully functional system.
## Downloading Druid ##
## Downloading Druid
If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first.
......@@ -20,15 +20,15 @@ tar -zxvf druid-services-*-bin.tar.gz
cd druid-services-*
```
You can also [Build From Source](Build-From-Source.html).
You can also [Build From Source](Build-from-source.html).
## External Dependencies ##
## External Dependencies
Druid requires 3 external dependencies. A "deep" storage that acts as a backup data repository, a relational database such as MySQL to hold configuration and metadata information, and [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.
For deep storage, we have made a public S3 bucket (static.druid.io) available where data for this particular tutorial can be downloaded. More on the data [later](Tutorial-Part-2.html#the-data).
For deep storage, we have made a public S3 bucket (static.druid.io) available where data for this particular tutorial can be downloaded. More on the data later.
### Setting up MySQL ###
#### Setting up MySQL
1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL
......@@ -43,7 +43,7 @@ GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```
### Setting up Zookeeper ###
#### Setting up Zookeeper
```bash
curl http://www.motorlogy.com/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz -o zookeeper-3.4.5.tar.gz
......@@ -54,9 +54,9 @@ cp conf/zoo_sample.cfg conf/zoo.cfg
cd ..
```
## The Data ##
## The Data
Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](https://github.com/metamx/druid/wiki/Segments). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](Loading-Your-Data.html).The segment we are going to work with has the following format:
Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](Segments.html). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](Tutorial%3A-Loading-Your-Data-Part-1.html). The segment we are going to work with has the following format:
Dimensions (things to filter on):
......@@ -84,28 +84,28 @@ Metrics (things to aggregate over):
"deleted"
```
## The Cluster ##
## The Cluster
Let's start up a few nodes and download our data. First things though, let's create a config directory where we will store configs for our various nodes:
Let's start up a few nodes and download our data. First things first though, let's make sure we have a config directory where we will store configs for our various nodes:
```
mkdir config
ls config
```
If you are interested in learning more about Druid configuration files, check out this [link](Configuration.html). Many aspects of Druid are customizable. For the purposes of this tutorial, we are going to use default values for most things.
### Start a Coordinator Node ###
#### Start a Coordinator Node
Coordinator nodes are in charge of load assignment and distribution. Coordinator nodes monitor the status of the cluster and command historical nodes to assign and drop segments.
For more information about coordinator nodes, see [here](Coordinator.html).
To create the coordinator config file:
The coordinator config file should already exist at:
```
mkdir config/coordinator
config/coordinator
```
Under the directory we just created, create the file `runtime.properties` with the following contents if it does not exist:
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
......@@ -130,18 +130,18 @@ To start the coordinator node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```
### Start a historical node ###
#### Start a Historical Node
Historical nodes are the workhorses of a cluster and are in charge of loading historical segments and making them available for queries. Our Wikipedia segment will be downloaded by a historical node.
For more information about Historical nodes, see [here](Historical.html).
To create the historical config file:
The historical config file should exist at:
```
mkdir config/historical
config/historical
```
Under the directory we just created, create the file `runtime.properties` with the following contents:
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
......@@ -167,18 +167,18 @@ To start the historical node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
### Start a Broker Node ###
#### Start a Broker Node
Broker nodes are responsible for figuring out which historical and/or realtime nodes correspond to which queries. They also merge partial results from these nodes in a scatter/gather fashion.
For more information about Broker nodes, see [here](Broker.html).
To create the broker config file:
The broker config file should exist at:
```
mkdir config/broker
config/broker
```
Under the directory we just created, create the file ```runtime.properties``` with the following contents:
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
......@@ -194,7 +194,7 @@ To start the broker node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```
## Loading the Data ##
## Loading the Data
The MySQL database we set up earlier contains a 'segments' table with entries for the segments that should be loaded into our cluster. The Druid coordinator compares this table with the segments that already exist in the cluster to determine what should be loaded and dropped. To load our wikipedia segment, we need to create an entry in our MySQL segments table.
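Conceptually, that entry is just a row describing where the segment lives and whether it is in use. The sketch below is only a hypothetical illustration: the column names and values are assumptions rather than the exact row used by this tutorial, and the payload (a JSON segment descriptor) is elided.

```sql
-- Hypothetical sketch: column names and values are assumptions, not the tutorial's exact row.
INSERT INTO druid_segments
  (id, dataSource, created_date, `start`, `end`, partitioned, version, used, payload)
VALUES
  ('wikipedia_2013-08-01T00:00:00.000Z_2013-08-02T00:00:00.000Z_v1',
   'wikipedia',
   '2013-08-08T00:00:00.000Z',
   '2013-08-01T00:00:00.000Z',
   '2013-08-02T00:00:00.000Z',
   0,
   'v1',
   1,
   '{... segment descriptor JSON (elided) ...}');
```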
......@@ -220,7 +220,8 @@ When the segment completes downloading and ready for queries, you should see the
At this point, we can query the segment. For more information on querying, see this [link](Querying.html).
## Next Steps ##
Next Steps
----------
Now that you have an understanding of what the Druid clsuter looks like, why not load some of your own data?
Check out the [Loading Your Own Data](Loading-Your-Data.html) section for more info!
Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data?
Check out the next [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html) for more info!
\ No newline at end of file
......@@ -80,7 +80,7 @@ Okay, things are about to get real. To query the real-time node you've spun up,
./run_example_client.sh
```
Select "webstream" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this:
Select "webstream" once again. This script issues [GroupByQueries](GroupByQuery.html) to the data we've been ingesting. The query looks like this:
```json
{
......@@ -304,15 +304,9 @@ You should see an answer to our question. For my stream, it looks like this:
Feel free to tweak other query parameters to answer other questions you may have about the data.
Next Steps
----------
What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
Additional Information
----------------------
This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki.
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
......@@ -322,6 +322,6 @@ Feel free to tweak other query parameters to answer other questions you may have
h2. Additional Information
This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to "Loading Your Data":./Loading-Your-Data.html.
This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to "The Druid Cluster":./Tutorial:-The-Druid-Cluster.html.
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our "google groups page":http://www.groups.google.com/forum/#!forum/druid-development.
......@@ -10,18 +10,27 @@
Getting Started
* [Tutorial: A First Look at Druid](Tutorial:-A-First-Look-at-Druid.html)
* [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
* [Loading Your Data](Loading-Your-Data.html)
* [Querying Your Data](Querying-Your-Data.html)
* [Booting a Production Cluster](Booting-a-Production-Cluster.html)
* [Examples](Examples.html)
* [Cluster Setup](Cluster-Setup.html)
* [Configuration](Configuration.html)
* [Tutorial: Loading Your Data Part 1](Tutorial:-Loading-Your-Data-Part-1.html)
* [Tutorial: Loading Your Data Part 2](Tutorial:-Loading-Your-Data-Part-2.html)
* [Tutorial: All About Queries](Tutorial:-All-About-Queries.html)
--------------------------------------
Evaluate Druid
* [Cluster Setup](Cluster-setup.html)
* [Booting a Production Cluster](Booting-a-production-cluster.html)
--------------------------------------
Configuration
* [Configuration](Configuration.html)
-------------------------------------
Data Ingestion
* [Realtime](Realtime.html)
* [Batch|Batch Ingestion](Batch|Batch-Ingestion.html)
* [Batch Ingestion](Batch-ingestion.html)
* [Indexing Service](Indexing-Service.html)
* [Indexing Service](Indexing-Service.html)
*** ]
*** [Tasks](Tasks.html)
----------------------------
Querying
......
......@@ -12,16 +12,22 @@ h1. Contents
h2. Getting Started
* "Tutorial: A First Look at Druid":./Tutorial:-A-First-Look-at-Druid.html
* "Tutorial: The Druid Cluster":./Tutorial:-The-Druid-Cluster.html
* "Loading Your Data":./Loading-Your-Data.html
* "Querying Your Data":./Querying-your-data.html
* "Tutorial: Loading Your Data Part 1":./Tutorial:-Loading-Your-Data-Part-1.html
* "Tutorial: Loading Your Data Part 2":./Tutorial:-Loading-Your-Data-Part-2.html
* "Tutorial: All About Queries":./Tutorial:-All-About-Queries.html
h2. Evaluate Druid
* "Cluster Setup":./Cluster-setup.html
* "Booting a Production Cluster":./Booting-a-production-cluster.html
* "Examples":./Examples.html
h2. Configuration
* "Configuration":Configuration.html
h2. Data Ingestion
* "Realtime":./Realtime.html
* "Batch":./Batch-ingestion.html
* "Indexing Service":./Indexing-Service.html
** "Tasks":./Tasks.html
h2. Querying
* "Querying":./Querying.html
......
[
{
"schema": {
"dataSource": "druidtest",
"aggregators": [
{
"type": "count",
"name": "impressions"
},
{
"type": "doubleSum",
"name": "wp",
"fieldName": "wp"
}
],
"indexGranularity": "minute",
"shardSpec": {
"type": "none"
}
"dataSource": "wikipedia",
"aggregators" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"indexGranularity": "none"
},
"config": {
"maxRowsInMemory": 500000,
......@@ -29,23 +31,20 @@
"zk.connectiontimeout.ms": "15000",
"zk.sessiontimeout.ms": "15000",
"zk.synctime.ms": "5000",
"groupid": "topic-pixel-local",
"groupid": "druid-example",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "druidtest",
"feed": "wikipedia",
"parser": {
"timestampSpec": {
"column": "utcdt",
"format": "iso"
"column": "timestamp"
},
"data": {
"format": "json"
},
"dimensionExclusions": [
"wp"
]
"format": "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
}
}
},
"plumber": {
......
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
\ No newline at end of file
{
"dataSource": "wikipedia",
"timestampColumn": "timestamp",
"timestampFormat": "iso",
"dataSpec": {
"format": "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
},
"granularitySpec" : {
"type" : "uniform",
"gran" : "DAY",
"intervals" : [ "2013-08-31/2013-09-01" ]
},
"pathSpec": {
"type": "static",
"paths": "examples/indexing/wikipedia_data.json"
},
"rollupSpec": {
"aggs": [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"rollupGranularity": "none"
},
"workingPath": "\/tmp\/working_path",
"segmentOutputPath": "\/tmp\/segments",
"partitionsSpec": {
"targetPartitionSize": 5000000
},
"updaterJobSpec": {
"type": "db",
"connectURI": "jdbc:mysql:\/\/localhost:3306\/druid",
"user": "druid",
"password": "diurd",
"segmentTable": "druid_segments"
}
}
\ No newline at end of file
{
"type" : "index_hadoop",
"config": {
"dataSource" : "wikipedia",
"timestampColumn" : "timestamp",
"timestampFormat" : "auto",
"dataSpec" : {
"format" : "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
},
"granularitySpec" : {
"type" : "uniform",
"gran" : "DAY",
"intervals" : [ "2013-08-31/2013-09-01" ]
},
"pathSpec" : {
"type" : "static",
"paths" : "examples/indexing/wikipedia_data.json"
},
"targetPartitionSize" : 5000000,
"rollupSpec" : {
"aggs": [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"rollupGranularity" : "none"
}
}
}
\ No newline at end of file
......@@ -8,7 +8,7 @@
},
"aggregators" : [{
"type" : "count",
"name" : "edit_count"
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
......
......@@ -12,8 +12,8 @@
},
{
"type":"longSum",
"fieldName":"edit_count",
"name":"count"
"fieldName":"count",
"name":"edit_count"
}
],
"filter":{
......
......@@ -11,6 +11,4 @@ druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
druid.db.connector.user=druid
druid.db.connector.password=diurd
druid.realtime.specFile=config/realtime/realtime.spec
druid.processing.buffer.sizeBytes=10000000
\ No newline at end of file