Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
kvdb
rocksdb
提交
d79762e2
R
rocksdb
项目概览
kvdb
/
rocksdb
11 个月前同步成功
通知
0
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
R
rocksdb
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
d79762e2
编写于
3月 08, 2012
作者:
S
Sanjay Ghemawat
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
added group commit; drastically speeds up multi-threaded synchronous write workloads
上级
015d26f8
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
162 additions
and
49 deletions
+162
-49
db/db_impl.cc
db/db_impl.cc
+107
-36
db/db_impl.h
db/db_impl.h
+9
-9
db/write_batch.cc
db/write_batch.cc
+13
-4
db/write_batch_internal.h
db/write_batch_internal.h
+2
-0
db/write_batch_test.cc
db/write_batch_test.cc
+31
-0
未找到文件。
db/db_impl.cc
浏览文件 @
d79762e2
...
...
@@ -35,6 +35,17 @@
namespace
leveldb
{
// Information kept for every waiting writer
struct
DBImpl
::
Writer
{
Status
status
;
WriteBatch
*
batch
;
bool
sync
;
bool
done
;
port
::
CondVar
cv
;
explicit
Writer
(
port
::
Mutex
*
mu
)
:
cv
(
mu
)
{
}
};
struct
DBImpl
::
CompactionState
{
Compaction
*
const
compaction
;
...
...
@@ -113,8 +124,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
logfile_
(
NULL
),
logfile_number_
(
0
),
log_
(
NULL
),
logger_
(
NULL
),
logger_cv_
(
&
mutex_
),
tmp_batch_
(
new
WriteBatch
),
bg_compaction_scheduled_
(
false
),
manual_compaction_
(
NULL
)
{
mem_
->
Ref
();
...
...
@@ -144,6 +154,7 @@ DBImpl::~DBImpl() {
delete
versions_
;
if
(
mem_
!=
NULL
)
mem_
->
Unref
();
if
(
imm_
!=
NULL
)
imm_
->
Unref
();
delete
tmp_batch_
;
delete
log_
;
delete
logfile_
;
delete
table_cache_
;
...
...
@@ -554,13 +565,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
}
Status
DBImpl
::
TEST_CompactMemTable
()
{
MutexLock
l
(
&
mutex_
);
LoggerId
self
;
AcquireLoggingResponsibility
(
&
self
);
Status
s
=
MakeRoomForWrite
(
true
/* force compaction */
);
ReleaseLoggingResponsibility
(
&
self
);
// NULL batch means just wait for earlier writes to be done
Status
s
=
Write
(
WriteOptions
(),
NULL
);
if
(
s
.
ok
())
{
// Wait until the compaction completes
MutexLock
l
(
&
mutex_
);
while
(
imm_
!=
NULL
&&
bg_error_
.
ok
())
{
bg_cv_
.
Wait
();
}
...
...
@@ -1094,38 +1103,35 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
return
DB
::
Delete
(
options
,
key
);
}
// There is at most one thread that is the current logger. This call
// waits until preceding logger(s) have finished and becomes the
// current logger.
void
DBImpl
::
AcquireLoggingResponsibility
(
LoggerId
*
self
)
{
while
(
logger_
!=
NULL
)
{
logger_cv_
.
Wait
();
}
logger_
=
self
;
}
Status
DBImpl
::
Write
(
const
WriteOptions
&
options
,
WriteBatch
*
my_batch
)
{
Writer
w
(
&
mutex_
);
w
.
batch
=
my_batch
;
w
.
sync
=
options
.
sync
;
w
.
done
=
false
;
void
DBImpl
::
ReleaseLoggingResponsibility
(
LoggerId
*
self
)
{
assert
(
logger_
==
self
);
logger_
=
NULL
;
logger_cv_
.
SignalAll
();
}
Status
DBImpl
::
Write
(
const
WriteOptions
&
options
,
WriteBatch
*
updates
)
{
Status
status
;
MutexLock
l
(
&
mutex_
);
LoggerId
self
;
AcquireLoggingResponsibility
(
&
self
);
status
=
MakeRoomForWrite
(
false
);
// May temporarily release lock and wait
writers_
.
push_back
(
&
w
);
while
(
!
w
.
done
&&
&
w
!=
writers_
.
front
())
{
w
.
cv
.
Wait
();
}
if
(
w
.
done
)
{
return
w
.
status
;
}
// May temporarily unlock and wait.
Status
status
=
MakeRoomForWrite
(
my_batch
==
NULL
);
uint64_t
last_sequence
=
versions_
->
LastSequence
();
if
(
status
.
ok
())
{
Writer
*
last_writer
=
&
w
;
if
(
status
.
ok
()
&&
my_batch
!=
NULL
)
{
// NULL batch is for compactions
WriteBatch
*
updates
=
BuildBatchGroup
(
&
last_writer
);
WriteBatchInternal
::
SetSequence
(
updates
,
last_sequence
+
1
);
last_sequence
+=
WriteBatchInternal
::
Count
(
updates
);
// Add to log and apply to memtable. We can release the lock during
// this phase since the "logger_" flag protects against concurrent
// loggers and concurrent writes into mem_.
// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
// into mem_.
{
assert
(
logger_
==
&
self
);
mutex_
.
Unlock
();
status
=
log_
->
AddRecord
(
WriteBatchInternal
::
Contents
(
updates
));
if
(
status
.
ok
()
&&
options
.
sync
)
{
...
...
@@ -1135,20 +1141,85 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
status
=
WriteBatchInternal
::
InsertInto
(
updates
,
mem_
);
}
mutex_
.
Lock
();
assert
(
logger_
==
&
self
);
}
if
(
updates
==
tmp_batch_
)
tmp_batch_
->
Clear
();
versions_
->
SetLastSequence
(
last_sequence
);
}
ReleaseLoggingResponsibility
(
&
self
);
while
(
true
)
{
Writer
*
ready
=
writers_
.
front
();
writers_
.
pop_front
();
if
(
ready
!=
&
w
)
{
ready
->
status
=
status
;
ready
->
done
=
true
;
ready
->
cv
.
Signal
();
}
if
(
ready
==
last_writer
)
break
;
}
// Notify new head of write queue
if
(
!
writers_
.
empty
())
{
writers_
.
front
()
->
cv
.
Signal
();
}
return
status
;
}
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-NULL batch
WriteBatch
*
DBImpl
::
BuildBatchGroup
(
Writer
**
last_writer
)
{
assert
(
!
writers_
.
empty
());
Writer
*
first
=
writers_
.
front
();
WriteBatch
*
result
=
first
->
batch
;
assert
(
result
!=
NULL
);
size_t
size
=
WriteBatchInternal
::
ByteSize
(
first
->
batch
);
// Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow
// down the small write too much.
size_t
max_size
=
1
<<
20
;
if
(
size
<=
(
128
<<
10
))
{
max_size
=
size
+
(
128
<<
10
);
}
*
last_writer
=
first
;
std
::
deque
<
Writer
*>::
iterator
iter
=
writers_
.
begin
();
++
iter
;
// Advance past "first"
for
(;
iter
!=
writers_
.
end
();
++
iter
)
{
Writer
*
w
=
*
iter
;
if
(
w
->
sync
&&
!
first
->
sync
)
{
// Do not include a sync write into a batch handled by a non-sync write.
break
;
}
if
(
w
->
batch
!=
NULL
)
{
size
+=
WriteBatchInternal
::
ByteSize
(
w
->
batch
);
if
(
size
>
max_size
)
{
// Do not make batch too big
break
;
}
// Append to *result
if
(
result
==
first
->
batch
)
{
// Switch to temporary batch instead of disturbing caller's batch
result
=
tmp_batch_
;
assert
(
WriteBatchInternal
::
Count
(
result
)
==
0
);
WriteBatchInternal
::
Append
(
result
,
first
->
batch
);
}
WriteBatchInternal
::
Append
(
result
,
w
->
batch
);
}
*
last_writer
=
w
;
}
return
result
;
}
// REQUIRES: mutex_ is held
// REQUIRES: this thread is
the current logger
// REQUIRES: this thread is
currently at the front of the writer queue
Status
DBImpl
::
MakeRoomForWrite
(
bool
force
)
{
mutex_
.
AssertHeld
();
assert
(
logger_
!=
NULL
);
assert
(
!
writers_
.
empty
()
);
bool
allow_delay
=
!
force
;
Status
s
;
while
(
true
)
{
...
...
db/db_impl.h
浏览文件 @
d79762e2
...
...
@@ -5,6 +5,7 @@
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
#define STORAGE_LEVELDB_DB_DB_IMPL_H_
#include <deque>
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
...
...
@@ -59,6 +60,8 @@ class DBImpl : public DB {
private:
friend
class
DB
;
struct
CompactionState
;
struct
Writer
;
Iterator
*
NewInternalIterator
(
const
ReadOptions
&
,
SequenceNumber
*
latest_snapshot
);
...
...
@@ -85,14 +88,8 @@ class DBImpl : public DB {
Status
WriteLevel0Table
(
MemTable
*
mem
,
VersionEdit
*
edit
,
Version
*
base
);
// Only one thread is allowed to log at a time.
struct
LoggerId
{
};
// Opaque identifier for logging thread
void
AcquireLoggingResponsibility
(
LoggerId
*
self
);
void
ReleaseLoggingResponsibility
(
LoggerId
*
self
);
Status
MakeRoomForWrite
(
bool
force
/* compact even if there is room? */
);
struct
CompactionState
;
WriteBatch
*
BuildBatchGroup
(
Writer
**
last_writer
);
void
MaybeScheduleCompaction
();
static
void
BGWork
(
void
*
db
);
...
...
@@ -129,8 +126,11 @@ class DBImpl : public DB {
WritableFile
*
logfile_
;
uint64_t
logfile_number_
;
log
::
Writer
*
log_
;
LoggerId
*
logger_
;
// NULL, or the id of the current logging thread
port
::
CondVar
logger_cv_
;
// For threads waiting to log
// Queue of writers.
std
::
deque
<
Writer
*>
writers_
;
WriteBatch
*
tmp_batch_
;
SnapshotList
snapshots_
;
// Set of table files to protect from deletion because they are
...
...
db/write_batch.cc
浏览文件 @
d79762e2
...
...
@@ -23,6 +23,9 @@
namespace
leveldb
{
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
static
const
size_t
kHeader
=
12
;
WriteBatch
::
WriteBatch
()
{
Clear
();
}
...
...
@@ -33,16 +36,16 @@ WriteBatch::Handler::~Handler() { }
void
WriteBatch
::
Clear
()
{
rep_
.
clear
();
rep_
.
resize
(
12
);
rep_
.
resize
(
kHeader
);
}
Status
WriteBatch
::
Iterate
(
Handler
*
handler
)
const
{
Slice
input
(
rep_
);
if
(
input
.
size
()
<
12
)
{
if
(
input
.
size
()
<
kHeader
)
{
return
Status
::
Corruption
(
"malformed WriteBatch (too small)"
);
}
input
.
remove_prefix
(
12
);
input
.
remove_prefix
(
kHeader
);
Slice
key
,
value
;
int
found
=
0
;
while
(
!
input
.
empty
())
{
...
...
@@ -131,8 +134,14 @@ Status WriteBatchInternal::InsertInto(const WriteBatch* b,
}
void
WriteBatchInternal
::
SetContents
(
WriteBatch
*
b
,
const
Slice
&
contents
)
{
assert
(
contents
.
size
()
>=
12
);
assert
(
contents
.
size
()
>=
kHeader
);
b
->
rep_
.
assign
(
contents
.
data
(),
contents
.
size
());
}
void
WriteBatchInternal
::
Append
(
WriteBatch
*
dst
,
const
WriteBatch
*
src
)
{
SetCount
(
dst
,
Count
(
dst
)
+
Count
(
src
));
assert
(
src
->
rep_
.
size
()
>=
kHeader
);
dst
->
rep_
.
append
(
src
->
rep_
.
data
()
+
kHeader
,
src
->
rep_
.
size
()
-
kHeader
);
}
}
// namespace leveldb
db/write_batch_internal.h
浏览文件 @
d79762e2
...
...
@@ -39,6 +39,8 @@ class WriteBatchInternal {
static
void
SetContents
(
WriteBatch
*
batch
,
const
Slice
&
contents
);
static
Status
InsertInto
(
const
WriteBatch
*
batch
,
MemTable
*
memtable
);
static
void
Append
(
WriteBatch
*
dst
,
const
WriteBatch
*
src
);
};
}
// namespace leveldb
...
...
db/write_batch_test.cc
浏览文件 @
d79762e2
...
...
@@ -18,6 +18,7 @@ static std::string PrintContents(WriteBatch* b) {
mem
->
Ref
();
std
::
string
state
;
Status
s
=
WriteBatchInternal
::
InsertInto
(
b
,
mem
);
int
count
=
0
;
Iterator
*
iter
=
mem
->
NewIterator
();
for
(
iter
->
SeekToFirst
();
iter
->
Valid
();
iter
->
Next
())
{
ParsedInternalKey
ikey
;
...
...
@@ -29,11 +30,13 @@ static std::string PrintContents(WriteBatch* b) {
state
.
append
(
", "
);
state
.
append
(
iter
->
value
().
ToString
());
state
.
append
(
")"
);
count
++
;
break
;
case
kTypeDeletion
:
state
.
append
(
"Delete("
);
state
.
append
(
ikey
.
user_key
.
ToString
());
state
.
append
(
")"
);
count
++
;
break
;
}
state
.
append
(
"@"
);
...
...
@@ -42,6 +45,8 @@ static std::string PrintContents(WriteBatch* b) {
delete
iter
;
if
(
!
s
.
ok
())
{
state
.
append
(
"ParseError()"
);
}
else
if
(
count
!=
WriteBatchInternal
::
Count
(
b
))
{
state
.
append
(
"CountMismatch()"
);
}
mem
->
Unref
();
return
state
;
...
...
@@ -82,6 +87,32 @@ TEST(WriteBatchTest, Corruption) {
PrintContents
(
&
batch
));
}
TEST
(
WriteBatchTest
,
Append
)
{
WriteBatch
b1
,
b2
;
WriteBatchInternal
::
SetSequence
(
&
b1
,
200
);
WriteBatchInternal
::
SetSequence
(
&
b2
,
300
);
WriteBatchInternal
::
Append
(
&
b1
,
&
b2
);
ASSERT_EQ
(
""
,
PrintContents
(
&
b1
));
b2
.
Put
(
"a"
,
"va"
);
WriteBatchInternal
::
Append
(
&
b1
,
&
b2
);
ASSERT_EQ
(
"Put(a, va)@200"
,
PrintContents
(
&
b1
));
b2
.
Clear
();
b2
.
Put
(
"b"
,
"vb"
);
WriteBatchInternal
::
Append
(
&
b1
,
&
b2
);
ASSERT_EQ
(
"Put(a, va)@200"
"Put(b, vb)@201"
,
PrintContents
(
&
b1
));
b2
.
Delete
(
"foo"
);
WriteBatchInternal
::
Append
(
&
b1
,
&
b2
);
ASSERT_EQ
(
"Put(a, va)@200"
"Put(b, vb)@202"
"Put(b, vb)@201"
"Delete(foo)@203"
,
PrintContents
(
&
b1
));
}
}
// namespace leveldb
int
main
(
int
argc
,
char
**
argv
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录