未验证 提交 be304f53 编写于 作者: J Jiawei Wang 提交者: GitHub

Merge pull request #1336 from bjjwwang/cube_062

Add seq reader tool and doc
......@@ -39,4 +39,8 @@ target_link_libraries(cube-builder ${DYNAMIC_LIB})
# install
# Install the cube-builder executable into <install dir>/bin.
install(TARGETS cube-builder RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin)
# Copy the whole tool/ directory next to the binaries.
# NOTE(review): the per-file rules below install entries that live inside this
# same tool/ directory — confirm whether both forms are meant to coexist.
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/tool DESTINATION ${PADDLE_SERVING_INSTALL_DIR})
# Install the individual helper scripts into <install dir>/tool.
install(FILES ${CMAKE_CURRENT_LIST_DIR}/tool/kvtool.py DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool)
install(FILES ${CMAKE_CURRENT_LIST_DIR}/tool/kv_to_seqfile.py DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool)
# Install the tool/source directory into <install dir>/tool.
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/tool/source DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool)
......@@ -169,7 +169,7 @@ int GeneralDistKVInferOp::inference() {
// call paddle inference here
if (InferManager::instance().infer(
engine_name().c_str(), &infer_in, out, batch_size)) {
LOG(ERROR) << << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name();
LOG(ERROR) << "(logid=" << log_id << ") Failed do infer in fluid model: " << engine_name();
return -1;
}
int64_t end = timeline.TimeStampUS();
......
......@@ -2,3 +2,16 @@ set(seq_gen_src ${CMAKE_CURRENT_LIST_DIR}/seq_generator.cpp ${CMAKE_CURRENT_LIS
LIST(APPEND seq_gen_src ${PROTO_SRCS})
add_executable(seq_generator ${seq_gen_src})
target_link_libraries(seq_generator protobuf -lpthread)
# seq_reader is built from its own driver source plus the cube-builder
# seqfile_reader.cpp implementation, compiled directly into the executable.
set(seq_reader_src ${CMAKE_CURRENT_LIST_DIR}/seq_reader.cpp ${CMAKE_CURRENT_LIST_DIR}/../../cube/cube-builder/src/seqfile_reader.cpp)
add_executable(seq_reader ${seq_reader_src})
# Install seq_reader with separate destinations per artifact kind
# (runtime -> bin, archive -> lib, library -> so).
install(TARGETS seq_reader
RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/bin
ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib
LIBRARY DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/so
)
# Also place both sequence-file tools under <install dir>/tool.
# NOTE(review): seq_reader is installed to both bin/ and tool/ — confirm this
# duplication is intended.
install(TARGETS seq_reader RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool)
install(TARGETS seq_generator RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/tool)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/time.h>
#include <limits.h>
#include <cstdint>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include "core/cube/cube-builder/include/cube-builder/seqfile_reader.h"
// Encodes `input` as uppercase hexadecimal text.
//
// Every byte of `input` becomes exactly two characters in the result, high
// nibble first, so the returned string is twice as long as `input`.
std::string string_to_hex(const std::string& input) {
  static const char* const kHexDigits = "0123456789ABCDEF";
  std::string hex;
  hex.reserve(input.size() * 2);
  for (const char ch : input) {
    // Go through unsigned char so bytes >= 0x80 index the table correctly.
    const unsigned char byte = static_cast<unsigned char>(ch);
    hex += kHexDigits[byte / 16];
    hex += kHexDigits[byte % 16];
  }
  return hex;
}
// Prints key/value records from the SequenceFile at `file` to stdout.
//
// Each record is written as "key: <decimal uint64> , value: <hex bytes>".
// At most `limit` records are printed; a `limit` of zero or less prints
// nothing. Failures to open the file, read its header, or close it are
// reported on stderr and end the function early.
//
// The key is decoded from the first 8 bytes of the record key in host byte
// order; shorter keys are zero-padded instead of being read out of bounds.
// NOTE(review): assumes Record::key and Record::value are std::string (the
// original code already called .data() / .c_str() on them) — confirm.
void printSeq(std::string file, int limit) {
  SequenceFileRecordReader reader(file.c_str());
  if (reader.open() != 0) {
    std::cerr << "open file failed! " << file;
    return;
  }
  if (reader.read_header() != 0) {
    std::cerr << "read header error! " << file;
    reader.close();
    return;
  }
  Record record(reader.get_header());
  int total_count = 0;
  // Check the limit before reading so that a non-positive limit reads nothing.
  while (total_count < limit && reader.next(&record) == 0) {
    // memcpy avoids the unaligned / aliasing-violating read that casting the
    // string buffer to uint64_t* would perform.
    uint64_t key = 0;
    const size_t key_bytes =
        record.key.size() < sizeof(key) ? record.key.size() : sizeof(key);
    std::memcpy(&key, record.key.data(), key_bytes);
    ++total_count;
    // Pass the value itself, not c_str(): values are binary and may contain
    // NUL bytes, which would otherwise truncate the hex dump.
    std::cout << "key: " << key << " , value: " << string_to_hex(record.value)
              << std::endl;
  }
  if (reader.close() != 0) {
    std::cerr << "close file failed! " << file;
    return;
  }
}
// Entry point of the seq_reader tool.
//
// Usage:
//   ./seq_reader $FILENAME            print every key/value pair
//   ./seq_reader $FILENAME $KEY_NUM   print at most $KEY_NUM pairs
//
// Returns 0 after printing, or -1 when the argument count is wrong or
// $KEY_NUM is not a valid integer.
int main(int argc, char **argv) {
  if (argc != 3 && argc != 2) {
    std::cout << "Seq Reader Usage:" << std::endl;
    std::cout << "get all keys: ./seq_reader $FILENAME " << std::endl;
    std::cout << "get some keys: ./seq_reader $FILENAME $KEY_NUM" << std::endl;
    return -1;
  }
  const char* filename_str = argv[1];
  std::cout << "cstr filename is " << filename_str << std::endl;
  std::string filename = filename_str;
  std::cout << "filename is " << filename << std::endl;
  // Without an explicit KEY_NUM, read the whole file.
  int key_limit = INT_MAX;
  if (argc == 3) {
    const char* key_num_str = argv[2];
    try {
      key_limit = std::stoi(key_num_str);
    } catch (const std::exception&) {
      // std::stoi throws on non-numeric or out-of-range input; report it
      // instead of letting the exception terminate the process.
      std::cerr << "invalid KEY_NUM: " << key_num_str << std::endl;
      return -1;
    }
  }
  printSeq(filename, key_limit);
  return 0;
}
......@@ -81,7 +81,7 @@ export PATH=$PATH:$GOPATH/bin
## Get go packages
```shell
go env -w GO111MODULE=on
go env -w GO111MODULE=auto
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
......
......@@ -80,7 +80,7 @@ export PATH=$PATH:$GOPATH/bin
## 获取 Go packages
```shell
go env -w GO111MODULE=on
go env -w GO111MODULE=auto
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
......
## 如何获得稀疏参数索引Cube所需的模型输入
#### 背景知识
推荐系统需要大规模稀疏参数索引来帮助分布式部署,可在`python/example/criteo_ctr_with_cube`或是[PaddleRec](https://github.com/paddlepaddle/paddlerec)了解推荐模型。
稀疏参数索引的模型格式是SequenceFile,源自Hadoop生态的键值对格式文件。
为了方便调试,我们提供了两个工具:一个将特定格式的可读文本文件转换为SequenceFile格式文件,另一个将SequenceFile格式文件转换为可读文本。
用户在调试Cube服务功能时,可以自定义KV对生成SequenceFile格式文件来进行调试。
用户在验证Cube的配送正确性时,可以转换SequenceFile格式文件至可读文字来进行比对验证。
#### 预备知识
- 需要会编译Paddle Serving,参见[编译文档](./COMPILE.md)
#### 用法
编译结束后,可以在安装目录中得到 seq_reader 和 kv_to_seqfile.py。
#### 生成SequenceFile
在`output/tool/`目录下,修改`output/tool/source/file.txt`,该文件每一行对应一个键值对,用冒号`:`分隔key和value部分。
例如:
```
1676869128226002114:48241 37064 91 -539 114 51 -122 269 229 -134 -282
1657749292782759014:167 40 98 27 117 10 -29 15 74 67 -54
```
执行
```
python kv_to_seqfile.py
```
即可生成`data`文件夹,我们看下它的结构
```
.
├── 20210805095422
│   └── base
│   └── feature
└── donefile
└── base.txt
```
其中`20210805095422/base/feature` 就是SequenceFile格式文件,donefile保存在`donefile/base.txt`
#### 查看SequenceFile
我们使用`seq_reader`工具来解读SequenceFile格式文件。
```
./seq_reader 20210805095422/base/feature 10 # 阅读开头的10个KV对
./seq_reader 20210805095422/base/feature # 阅读所有KV对
```
结果
```
key: 1676869128226002114 , value: 343832343109333730363409093931092D35333909313134093531092D3132320932363909323239092D313334092D323832
key: 1657749292782759014 , value: 3136370934300909393809323709313137093130092D3239093135093734093637092D3534
```
其中value 我们目前都以16进制的形式打印。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册