Commit 766adaea authored by zhouyingfeng

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into stat_usage

......@@ -107,7 +107,6 @@ function(link_paddle_exe TARGET_NAME)
paddle_parameter
paddle_proto
paddle_cuda
paddle_test_main
${METRIC_LIBS}
${PROTOBUF_LIBRARY}
${LIBGLOG_LIBRARY}
......@@ -155,8 +154,9 @@ endfunction()
# Rest Arguments: not used.
function(link_paddle_test TARGET_NAME)
link_paddle_exe(${TARGET_NAME})
target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
${GTEST_LIBRARIES})
target_link_libraries(${TARGET_NAME}
paddle_test_main
${GTEST_LIBRARIES})
endfunction()
# add_unittest_without_exec
......
......@@ -16,10 +16,9 @@ set -e
set -x
BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
wget ${BASE_URL}/baidu.dict
DOWNLOAD_ITEMS=(model_32.emb model_64.emb model_128.emb model_256.emb)
ITEM_MD5=(f88c8325ee6da6187f1080e8fe66c1cd
DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
f88c8325ee6da6187f1080e8fe66c1cd
927cf70f27f860aff1a5703ebf7f1584
a52e43655cd25d279777ed509a1ae27b
b92c67fe9ff70fea53596080e351ac80)
......
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
import math
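# Usage: python get_best_pass.py <train.log>
# Finds the pass with the lowest test cost in a Paddle training log and
# reports sqrt(cost) / 2 as the prediction error, as computed below.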
def get_best_pass(log_filename):
with open(log_filename, 'r') as f:
text = f.read()
pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
re.S)
results = re.findall(pattern, text)
sorted_results = sorted(results, key=lambda result: float(result[0]))
return sorted_results[0]
log_filename = sys.argv[1]
log = get_best_pass(log_filename)
predict_error = math.sqrt(float(log[0])) / 2
print 'Best pass is %s, error is %s, which means the prediction error is %f' % (
log[1], log[0], predict_error)
evaluate_pass = "output/pass-%s" % log[1]
print "evaluating from pass %s" % evaluate_pass
This diff is collapsed.
# Paddle On Kubernetes
> In this article, we will introduce how to run a Paddle training job on a single CPU machine using Kubernetes. In the next article, we will introduce how to run a Paddle training job on a distributed cluster.
## Build Docker Image
In a distributed Kubernetes cluster, we would use Ceph or another shared storage system to store the training data, so that all processes of a Paddle training job can retrieve it. In this example, we only demo a training job on a single machine. To simplify the environment requirements, we put the training data directly into Paddle's Docker image, so we need to create a Paddle Docker image that already contains the training data.
Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download the data and train on it using scripts from Paddle's source code.
The `paddledev/paddle:cpu-demo-latest` image contains the Paddle source code and demos. (Caution: the default Paddle image `paddledev/paddle:cpu-latest` does not include the source code; the different versions of Paddle images are listed in the [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html).) So we run this container, download the training data inside it, and then commit the whole container as a new Docker image.
### Run Docker Container
```
$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
```
### Download Training Data
Go into the `/root/paddle/demo/quick_start/data` directory and run `get_data.sh` to download the training data.
Then go into the `/root/paddle/demo/quick_start` directory and run `preprocess.sh` to preprocess the training data.
```
$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
Downloading Amazon Electronics reviews data...
--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: 'reviews_Electronics_5.json.gz'
10% [=======> ] 874,279 64.7KB/s eta 2h 13m
```
### Modify Startup Script
After downloading the data, modify `/root/paddle/demo/quick_start/train.sh` so that its contents are as follows (one extra `cd` command is added):
```
set -e
cd /root/paddle/demo/quick_start
cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
#cfg=trainer_config.bidi-lstm.py
#cfg=trainer_config.db-lstm.py
paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=20 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
```
### Commit Docker Image
```
$ docker commit quick_start_data mypaddle/paddle:quickstart
```
## Use Kubernetes For Training
> We will use a Kubernetes Job for the training process; the following steps show how to do the training with Kubernetes.
### Create Yaml Files
The output produced inside the container is destroyed when the job finishes (the container stops running), so we need to mount a volume to the local disk when creating the container in order to persist the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job); the yaml contents are as follows:
```
apiVersion: batch/v1
kind: Job
metadata:
name: quickstart
spec:
parallelism: 1
completions: 1
template:
metadata:
name: quickstart
spec:
volumes:
- name: output
hostPath:
path: /home/work/paddle_output
containers:
- name: pi
image: mypaddle/paddle:quickstart
command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"]
volumeMounts:
- name: output
mountPath: /root/paddle/demo/quick_start/output
restartPolicy: Never
```
### Start Paddle Job
Use the above yaml file to start the Kubernetes job:
```
$ kubectl create -f paddle.yaml
```
Get the detailed status of the job:
```
$ kubectl get job
NAME         DESIRED   SUCCESSFUL   AGE
quickstart   1         0            58s
$ kubectl describe job quickstart
Name: quickstart
Namespace: default
Image(s): registry.baidu.com/public/paddle:cpu-demo-latest
Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
Parallelism: 1
Completions: 1
Start Time: Mon, 31 Oct 2016 11:20:16 +0800
Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
Pods Statuses: 0 Running / 1 Succeeded / 0 Failed
Volumes:
output:
Type: HostPath (bare host directory volume)
Path: /home/work/paddle_output
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx
```
### Get Training Result
We can use the `kubectl` command to check the status of the related pod:
```
$ kubectl describe pod quickstart-fa0wx
Name: quickstart-fa0wx
Namespace: default
Node: paddle-demo-let02/10.206.202.44
Start Time: Mon, 31 Oct 2016 11:20:17 +0800
Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
Status: Succeeded
IP: 10.0.0.9
Controllers: Job/quickstart
Containers:
quickstart:
Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
Image: registry.baidu.com/public/paddle:cpu-demo-latest
Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
Port:
Command:
bin/bash
-c
/root/paddle/demo/quick_start/train.sh
QoS Tier:
cpu: BestEffort
memory: BestEffort
State: Terminated
Reason: Completed
Exit Code: 0
Started: Mon, 31 Oct 2016 11:20:20 +0800
Finished: Mon, 31 Oct 2016 11:21:46 +0800
Ready: False
Restart Count: 0
Environment Variables:
Conditions:
Type Status
Ready False
Volumes:
output:
Type: HostPath (bare host directory volume)
Path: /home/work/paddle_output
```
We can also SSH into the Kubernetes node to inspect the training result:
```
[root@paddle-demo-let02 paddle_output]# ll
total 60
drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
```
add_subdirectory(cuda)
add_subdirectory(function)
add_subdirectory(utils)
add_subdirectory(testing)
add_subdirectory(math)
add_subdirectory(parameter)
add_subdirectory(gserver)
......
......@@ -178,6 +178,7 @@ namespace std {
%newobject ParameterOptimizer::create;
%newobject ParameterOptimizer::needSpecialTraversal;
%newobject ParameterUpdater::createLocalUpdater;
%newobject ParameterUpdater::createRemoteUpdater;
%feature("director") UpdateCallback;
%feature("autodoc", 1); // To generate method stub, for code hint in ide
......
......@@ -803,6 +803,8 @@ private:
public:
static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
int passCount);
~ParameterUpdater();
/**
......
......@@ -15,15 +15,25 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/RemoteParameterUpdater.h"
#include "paddle/trainer/ThreadParameterUpdater.h"
ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
ParameterUpdater *ParameterUpdater::createLocalUpdater(
OptimizationConfig *config) {
auto param = new ParameterUpdater();
param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig()));
return param;
auto updater = new ParameterUpdater();
updater->m->updater.reset(
new paddle::SgdThreadUpdater(config->m->getConfig()));
return updater;
}
ParameterUpdater *ParameterUpdater::createRemoteUpdater(
OptimizationConfig *config, int passCount) {
auto updater = new ParameterUpdater();
updater->m->updater.reset(new paddle::RemoteParameterUpdater(
config->m->getConfig(), passCount, nullptr));
return updater;
}
ParameterUpdater::~ParameterUpdater() { delete m; }
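// A minimal usage sketch (not part of this diff; `config`, `passCount` and
// `useRemote` are assumed to come from the caller): choosing between the
// local updater and the new remote updater.
//
//   ParameterUpdater *updater =
//       useRemote ? ParameterUpdater::createRemoteUpdater(config, passCount)
//                 : ParameterUpdater::createLocalUpdater(config);
//   // ... use the updater for training ...
//   delete updater;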
......
......@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
extern void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
*
* @param[in] input input sequence.
* @param[in] sequence sequence index.
* @param[in] weightData padding data.
* @param[out] output output sequence.
* @param[in] numSequences number of sequences.
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
* @param[in] beginPad number of extra timesteps added at the
* beginning.
* @param[in] isPadding trainable padding.
*
*/
extern void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding);
/**
* @brief Context projection backward data.
*
* @param[in] outputGrad output gradient.
* @param[in] sequence sequence index.
* @param[out] inputGrad input gradient.
* @param[in] numSequences number of sequences.
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
*
*/
extern void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart);
/**
* @brief Context projection backward weight.
*
* @param[in] outputGrad output gradient.
* @param[in] sequence sequence index.
* @param[out] weightGrad weight gradient.
* @param[in] numSequences number of sequences.
* @param[in] weightDim input sequence dimension.
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
* @param[in] beginPad number of extra timesteps added at the
* beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad);
/**
* @brief Memory copy from sequence to batch.
*
......
......@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
inline void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding) {}
inline void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart) {}
inline void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad) {}
inline void hl_sequence2batch_copy(real* batch,
real* sequence,
const int* batchIndex,
......
......@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
CHECK_SYNC("hl_max_sequence_backward failed");
}
template <bool padding>
__global__ void KeContextProjectionForward(real* input,
const int* sequence,
real* weightData,
real* output,
int inputDim,
int contextLength,
int contextStart,
int beginPad) {
int idx = threadIdx.x;
int blockSize = blockDim.x;
int sequenceId = blockIdx.x;
int seqStart = sequence[sequenceId];
int seqEnd = sequence[sequenceId+1];
real value = 0;
int instances = seqEnd - seqStart + contextLength - 1;
output += seqStart * inputDim * contextLength;
input += seqStart * inputDim;
for (int k = 0; k <= inputDim / blockSize; k++) {
if (idx < inputDim) {
for (int i = 0; i < instances; i++) {
// i + contextStart;
if ((i + contextStart) < 0) {
if (padding) {
value = weightData[i * inputDim + idx];
} else {
continue;
}
} else if ((i + contextStart) >= (seqEnd - seqStart)) {
if (padding) {
value =
weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
inputDim + idx];
} else {
continue;
}
} else {
value = input[(i + contextStart) * inputDim + idx];
}
int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
real* output_r =
output + outy * inputDim * contextLength + outx * inputDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
output_r[idx] += value;
if (j - outy == outx) break;
output_r += (contextLength - 1) * inputDim;
}
}
}
idx += blockSize;
}
}
void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(output);
CHECK(!isPadding || weightData);
int blockSize = 128;
int blocksX = numSequences;
int blocksY = 1;
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
if (isPadding) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weightData, output, inputDim,
contextLength, contextStart, beginPad);
} else {
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weightData, output, inputDim,
contextLength, contextStart, beginPad);
}
CHECK_SYNC("hl_context_projection_forward failed");
}
__global__ void KeContextProjectionBackwardData(real* outputGrad,
const int* sequence,
real* inputGrad,
int inputDim,
int contextLength,
int contextStart) {
int idx = threadIdx.x;
int blockSize = blockDim.x;
int sequenceId = blockIdx.x;
int seqStart = sequence[sequenceId];
int seqEnd = sequence[sequenceId+1];
real value = 0;
int instances = seqEnd - seqStart + contextLength - 1;
outputGrad += seqStart * inputDim * contextLength;
inputGrad += seqStart * inputDim;
for (int k = 0; k <= inputDim / blockSize; k++) {
if (idx < inputDim) {
for (int i = 0; i < instances; i++) {
if ((i + contextStart) < 0) {
continue;
} else if ((i + contextStart) >= (seqEnd - seqStart)) {
continue;
} else {
// value = 0;
value = inputGrad[(i + contextStart) * inputDim + idx];
}
int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
real* output_r =
outputGrad + outy * inputDim * contextLength + outx * inputDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
value += output_r[idx];
if (j - outy == outx) break;
output_r += (contextLength - 1) * inputDim;
}
inputGrad[(i + contextStart) * inputDim + idx] = value;
}
}
idx += blockSize;
}
}
void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(inputGrad);
int blockSize = 128;
int blocksX = numSequences;
int blocksY = 1;
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
CHECK_SYNC("hl_context_projection_backward_data failed");
}
template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int contextLength,
int contextStart,
int beginPad) {
__shared__ real sum_s[THREADS_Y][THREADS_X];
int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
const int idx = threadIdx.x;
const int idy = threadIdx.y;
int padId = blockIdx.x / padOfBlock;
int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
int instanceId;
real value = 0;
real* output_r;
sum_s[idy][idx] = 0.0f;
if (weightIdx < weightDim) {
for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
int seqStart = sequence[seqId];
int seqEnd = sequence[seqId+1];
output_r = outputGrad + seqStart * weightDim * contextLength;
if (contextStart < 0) {
if (padId + contextStart < 0) {
instanceId = padId;
} else {
// beginPad > 0;
instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
}
} else {
if (padId + (seqEnd - seqStart) < contextStart) {
continue;
} else {
// beginPad == 0;
instanceId = padId + (seqEnd - seqStart) - contextStart;
}
}
int outx = (instanceId - contextLength) < 0 ?
instanceId : (contextLength - 1);
int outy = (instanceId - contextLength) < 0 ?
0 : (instanceId - (contextLength - 1));
output_r += outy * weightDim * contextLength + outx * weightDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
value += output_r[weightIdx];
if (j - outy == outx) break;
output_r += (contextLength - 1) * weightDim;
}
}
sum_s[idy][idx] = value;
}
__syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx];
}
__syncthreads();
}
__syncthreads();
if (weightIdx < weightDim) {
if (idy == 0) {
weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
}
}
}
void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(weightGrad);
int threadsX = 32;
int threadsY = 32;
int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
dim3 threads(threadsX, threadsY);
dim3 grid(blocksX, 1);
KeContextProjectionBackwardWeight<32, 32>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, sequence, weightGrad, numSequences, weightDim,
contextLength, contextStart, beginPad);
CHECK_SYNC("hl_context_projection_backward_weight failed");
}
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
real* table,
......
......@@ -11,13 +11,16 @@ endif()
add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
add_library(paddle_test_main STATIC TestMain.cpp)
if(WITH_GPU)
if(WITH_TESTING)
# TODO:
# file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
add_simple_unittest(CrossMapNormalOpTest)
add_unittest(ContextProjectionOpTest
ContextProjectionOpTest.cpp
../gserver/tests/TestUtil.cpp)
endif()
endif()
add_style_check_target(paddle_function ${h_files})
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ContextProjectionOp.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"
namespace paddle {
template <>
void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
const CpuMatrix* input_mat,
const CpuMatrix* weight_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad) {
const int* starts = seq_vec.getData();
const size_t num_sequences = seq_vec.getSize() - 1;
auto w_mat = const_cast<CpuMatrix*>(weight_mat);
auto in_mat = const_cast<CpuMatrix*>(input_mat);
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
int end = starts[i + 1] + context_start + j;
int dst_begin = starts[i];
int dst_end = starts[i + 1];
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
if (w_mat) {
MatrixPtr sub = w_mat->subMatrix(j, pad_size);
mat->addAtOffset(*sub, j * in_mat->getWidth());
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
if (w_mat) {
MatrixPtr sub = w_mat->subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
mat->addAtOffset(*sub, j * in_mat->getWidth());
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
MatrixPtr src = in_mat->subMatrix(begin, end - begin);
MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
dst->addAtOffset(*src, j * in_mat->getWidth());
}
}
}
/**
* \param inputs[0] input value.
* \param inputs[1] input weight.
* \param inputs[2] input sequence.
* \param outputs[0] output value.
*/
template <DeviceType Device>
class ContextProjectionForwardFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(3, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 2);
CHECK_EQ(inputs[2].dims_.size(), 1);
/// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
/// input and output has the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
const auto w_mat =
!inputs[1].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
ContextProjectionForward<Device>(out_mat.get(),
in_mat.get(),
w_mat.get(),
seq_vec,
context_length_,
context_start_,
begin_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
};
template <>
void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
CpuMatrix* in_grad_mat,
CpuMatrix* w_grad_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
CHECK(out_grad_mat);
size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
: w_grad_mat ? w_grad_mat->getWidth() : 0;
const int* starts = seq_vec.getData();
size_t num_sequences = seq_vec.getSize() - 1;
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
int end = starts[i + 1] + context_start + j;
int dst_begin = starts[i];
int dst_end = starts[i + 1];
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat =
out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
MatrixPtr sub = w_grad_mat->subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
if (!in_grad_mat) continue;
MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
src->addAtOffset(*dst, j * input_dim);
}
}
}
/**
* \param inputs[0] input grad.
* \param inputs[1] weight grad.
* \param inputs[2] input sequence.
* \param outputs[0] output value.
*/
template <DeviceType Device>
class ContextProjectionBackwardFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
is_padding_ = config.get<bool>("is_padding");
total_pad_ = config.get<size_t>("total_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(3, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(outputs[0].getData() && inputs[2].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 2);
CHECK_EQ(inputs[2].dims_.size(), 1);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
/// input and output has the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
/// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto in_grad_mat =
!inputs[0].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
auto w_grad_mat =
!inputs[1].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
ContextProjectionBackward<Device>(out_grad_mat.get(),
in_grad_mat ? in_grad_mat.get() : nullptr,
w_grad_mat ? w_grad_mat.get() : nullptr,
seq_vec,
context_length_,
context_start_,
begin_pad_,
is_padding_,
total_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
bool is_padding_;
size_t total_pad_;
};
/**
* \param inputs[0] input grad.
* \param inputs[1] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardDataFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(2, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 1);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
/// input and output has the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardData<Device>(out_grad_mat.get(),
in_grad_mat.get(),
seq_vec,
context_length_,
context_start_);
}
private:
size_t context_length_;
int context_start_;
};
/**
* \param inputs[0] weight grad.
* \param inputs[1] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardWeightFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
total_pad_ = config.get<size_t>("total_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(2, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 1);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
w_grad_mat.get(),
seq_vec,
context_length_,
context_start_,
total_pad_,
begin_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
size_t total_pad_;
};
REGISTER_TYPED_FUNC(ContextProjectionForward,
CPU,
ContextProjectionForwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackward,
CPU,
ContextProjectionBackwardFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(ContextProjectionForward,
GPU,
ContextProjectionForwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackward,
GPU,
ContextProjectionBackwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
GPU,
ContextProjectionBackwardDataFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
GPU,
ContextProjectionBackwardWeightFunc);
#endif
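// Usage sketch (illustrative, mirroring Projection::createFunction later in
// this change): registered functions are looked up by type name plus a
// "-CPU" or "-GPU" suffix, then configured through FuncConfig.
//
//   auto func = FunctionBase::funcRegistrar_.createByType(
//       "ContextProjectionForward-CPU");
//   func->init(FuncConfig()
//                  .set("context_length", (size_t)3)
//                  .set("context_start", -1)
//                  .set("begin_pad", (size_t)1));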
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/**
* \brief Context Projection Forward.
*
* \param[out] outputs output data.
* \param[in] input input data.
* \param[in] weight input weight.
* \param[in] sequence input data.
* \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position.
* \param[in] begin_pad beginning pad position.
* \param[in] is_padding whether padding 0 or not.
*
*/
template <DeviceType Device>
void ContextProjectionForward(typename MatrixT<Device>::type* output,
const typename MatrixT<Device>::type* input,
const typename MatrixT<Device>::type* weight,
const typename SequenceT<Device>::type& sequence,
size_t context_length,
int context_start,
size_t begin_pad);
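/**
 * A worked size example (illustrative): with context_start = -1 and
 * context_length = 3, each output row concatenates input rows t-1, t and
 * t+1, so
 *   output width   = input dim * context_length,
 *   total pad rows = max(0, -context_start)
 *                  + max(0, context_start + context_length - 1) = 1 + 1 = 2,
 * matching the dimension checks in ContextProjectionOp.cpp and the values
 * used by ContextProjectionOpTest.
 */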
/**
* \brief Context Projection Backward.
*
* \param[out] outputs output gradient.
* \param[in] input input gradient.
* \param[in] weight input weight gradient.
* \param[in] sequence input data.
* \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position.
* \param[in] begin_pad beginning pad position.
* \param[in] is_padding whether padding 0 or not.
*
*/
template <DeviceType Device>
void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* in_grad,
typename MatrixT<Device>::type* w_grad,
const typename SequenceT<Device>::type& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad);
template <DeviceType Device>
void ContextProjectionBackwardData(
typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* in_grad,
const typename SequenceT<Device>::type& sequence,
size_t context_length,
int context_start);
template <DeviceType Device>
void ContextProjectionBackwardWeight(
typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* w_grad,
const typename SequenceT<Device>::type& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad);
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "ContextProjectionOp.h"
namespace paddle {
template <bool padding>
__global__ void KeContextProjectionForward(const real* input,
const int* sequence,
const real* weight,
real* output,
int input_dim,
int context_length,
int context_start,
int begin_pad) {
int idx = threadIdx.x;
int block_size = blockDim.x;
int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1];
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
output += seq_start * input_dim * context_length;
input += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
for (int i = 0; i < instances; i++) {
// i + context_start;
if ((i + context_start) < 0) {
if (padding) {
value = weight[i * input_dim + idx];
} else {
continue;
}
} else if ((i + context_start) >= (seq_end - seq_start)) {
if (padding) {
value =
weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
input_dim + idx];
} else {
continue;
}
} else {
value = input[(i + context_start) * input_dim + idx];
}
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
output + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
output_r[idx] += value;
if (j - outy == outx) break;
output_r += (context_length - 1) * input_dim;
}
}
}
idx += block_size;
}
}
/**
* @brief Context projection forward.
*
* @param[in] input input sequence.
* @param[in] sequence sequence index.
* @param[in] weight padding data.
* @param[out] output output sequence.
* @param[in] num_sequences number of sequences.
* @param[in] input_dim input sequence dimension.
* @param[in] context_length context length.
* @param[in] context_start context start.
* @param[in] begin_pad number of extra timesteps added at the
* beginning.
*
*/
void hl_context_projection_forward(const real* input,
const int* sequence,
const real* weight,
real* output,
size_t num_sequences,
size_t input_dim,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(output);
int block_size = 128;
int blocks_x = num_sequences;
int blocks_y = 1;
dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y);
if (weight) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weight, output, input_dim,
context_length, context_start, begin_pad);
} else {
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weight, output, input_dim,
context_length, context_start, begin_pad);
}
CHECK_SYNC("hl_context_projection_forward failed");
}
template <>
void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
const GpuMatrix* input,
const GpuMatrix* weight,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK(input && output);
hl_context_projection_forward(input->getData(),
sequence.getData(),
weight ? weight->getData() : nullptr,
output->getData(),
sequence.getSize() - 1,
input->getWidth(),
context_length,
context_start,
begin_pad);
}
__global__ void KeContextProjectionBackwardData(real* out_grad,
const int* sequence,
real* in_grad,
int input_dim,
int context_length,
int context_start) {
int idx = threadIdx.x;
int block_size = blockDim.x;
int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1];
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
out_grad += seq_start * input_dim * context_length;
in_grad += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
for (int i = 0; i < instances; i++) {
if ((i + context_start) < 0) {
continue;
} else if ((i + context_start) >= (seq_end - seq_start)) {
continue;
} else {
// value = 0;
value = in_grad[(i + context_start) * input_dim + idx];
}
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
out_grad + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[idx];
if (j - outy == outx) break;
output_r += (context_length - 1) * input_dim;
}
in_grad[(i + context_start) * input_dim + idx] = value;
}
}
idx += block_size;
}
}
/**
* @brief Context projection backward data.
*
* @param[in] out_grad output gradient.
* @param[in] sequence sequence index.
* @param[out] input_grad input gradient.
* @param[in] num_sequences number of sequences.
* @param[in] input_dim input sequence dimension.
* @param[in] context_length context length.
* @param[in] context_start context start.
*
*/
void hl_context_projection_backward_data(real* out_grad,
const int* sequence,
real* input_grad,
size_t num_sequences,
size_t input_dim,
size_t context_length,
int context_start) {
CHECK_NOTNULL(out_grad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(input_grad);
int block_size = 128;
int blocks_x = num_sequences;
int blocks_y = 1;
dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
(out_grad, sequence, input_grad, input_dim, context_length, context_start);
CHECK_SYNC("hl_context_projection_backward_data failed");
}
template <>
void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
GpuMatrix* in_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start) {
CHECK(in_grad && out_grad);
hl_context_projection_backward_data(out_grad->getData(),
sequence.getData(),
in_grad->getData(),
sequence.getSize() - 1,
in_grad->getWidth(),
context_length,
context_start);
}
template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* out_grad,
const int* sequence,
real* w_grad,
int num_sequences,
int w_dim,
int context_length,
int context_start,
int begin_pad) {
__shared__ real sum_s[THREADS_Y][THREADS_X];
int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
const int idx = threadIdx.x;
const int idy = threadIdx.y;
int padId = blockIdx.x / pad_of_block;
int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
int instanceId;
real value = 0;
real* output_r;
sum_s[idy][idx] = 0.0f;
if (weight_idx < w_dim) {
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
int seq_start = sequence[seqId];
int seq_end = sequence[seqId+1];
output_r = out_grad + seq_start * w_dim * context_length;
if (context_start < 0) {
if (padId + context_start < 0) {
instanceId = padId;
} else {
// begin_pad > 0;
instanceId = (padId - begin_pad) +
(seq_end - seq_start) - context_start;
}
} else {
if (padId + (seq_end - seq_start) < context_start) {
continue;
} else {
// begin_pad == 0;
instanceId = padId + (seq_end - seq_start) - context_start;
}
}
int outx = (instanceId - context_length) < 0 ?
instanceId : (context_length - 1);
int outy = (instanceId - context_length) < 0 ?
0 : (instanceId - (context_length - 1));
output_r += outy * w_dim * context_length + outx * w_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[weight_idx];
if (j - outy == outx) break;
output_r += (context_length - 1) * w_dim;
}
}
sum_s[idy][idx] = value;
}
__syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx];
}
__syncthreads();
}
__syncthreads();
if (weight_idx < w_dim) {
if (idy == 0) {
w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
}
}
}
/**
* @brief Context projection backward weight.
*
* @param[in] out_grad output gradient.
* @param[in] sequence sequence index.
* @param[out] w_grad weight gradient.
* @param[in] num_sequences number of sequences.
* @param[in] w_dim input sequence dimension.
* @param[in] total_pad number of extra timesteps.
* @param[in] context_length context length.
* @param[in] context_start context start.
* @param[in] begin_pad number of extra timesteps added at the
* beginning.
*
*/
void hl_context_projection_backward_weight(real* out_grad,
const int* sequence,
real* w_grad,
size_t num_sequences,
size_t w_dim,
size_t total_pad,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK_NOTNULL(out_grad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(w_grad);
int threads_x = 32;
int threads_y = 32;
int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
dim3 threads(threads_x, threads_y);
dim3 grid(blocks_x, 1);
KeContextProjectionBackwardWeight<32, 32>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(out_grad, sequence, w_grad, num_sequences, w_dim,
context_length, context_start, begin_pad);
CHECK_SYNC("hl_context_projection_backward_weight failed");
}
template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
GpuMatrix* out_grad,
GpuMatrix* w_grad,
const GpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad) {
CHECK(out_grad && w_grad);
hl_context_projection_backward_weight(out_grad->getData(),
seq_vec.getData(),
w_grad->getData(),
seq_vec.getSize() - 1,
w_grad->getWidth(),
total_pad,
context_length,
context_start,
begin_pad);
}
template <>
void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
GpuMatrix* in_grad,
GpuMatrix* w_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
CHECK(out_grad);
if (in_grad) {
ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
out_grad,
in_grad,
sequence,
context_length,
context_start);
}
if (is_padding && w_grad) {
ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
out_grad,
w_grad,
sequence,
context_length,
context_start,
total_pad,
begin_pad);
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/math/Matrix.h"
using namespace paddle; // NOLINT
void testMatrixProjectionForward(int context_start,
size_t context_length,
bool is_padding,
size_t batch_size,
size_t input_dim) {
size_t pad = std::max(0, -context_start) +
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start)));
CpuMatrix cpu_in(batch_size, input_dim);
cpu_in.randomizeUniform();
GpuMatrix gpu_in(batch_size, input_dim);
gpu_in.copyFrom(cpu_in);
auto cpu_weight =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_weight =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_weight->randomizeUniform();
gpu_weight->copyFrom(*cpu_weight);
}
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
CpuMatrix cpu_out(batch_size, input_dim * context_length);
GpuMatrix gpu_out(batch_size, input_dim * context_length);
cpu_out.randomizeUniform();
gpu_out.copyFrom(cpu_out);
compare.getCpuFunction()->calc(
{Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckEqual(cpu_out, gpu_out);
}
void testMatrixProjectionBackward(int context_start,
int context_length,
bool is_padding,
size_t batch_size,
size_t input_dim) {
size_t pad = std::max(0, -context_start) +
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start))
.set("is_padding", is_padding)
.set("total_pad", pad));
CpuMatrix cpu_in_grad(batch_size, input_dim);
cpu_in_grad.randomizeUniform();
GpuMatrix gpu_in_grad(batch_size, input_dim);
gpu_in_grad.copyFrom(cpu_in_grad);
CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
cpu_out_grad.randomizeUniform();
GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
gpu_out_grad.copyFrom(cpu_out_grad);
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
auto cpu_w_grad =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_w_grad =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_w_grad->randomizeUniform();
gpu_w_grad->copyFrom(*cpu_w_grad);
}
compare.getCpuFunction()->calc(
{Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
if (is_padding) {
autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
}
}
TEST(ContextProjection, projection) {
for (auto context_start : {-5, -3, -1, 0, 3}) {
for (auto context_length : {1, 2, 5, 7}) {
for (auto trainable_padding : {false, true}) {
for (auto batch_size : {1, 2, 5, 20, 100}) {
for (auto input_dim : {15, 32, 63, 128, 200}) {
VLOG(3) << " context_start=" << context_start
<< " context_length=" << context_length
<< " trainable_padding=" << trainable_padding
<< " batch_size=" << batch_size
<< " input_dim=" << input_dim;
testMatrixProjectionForward(context_start,
context_length,
trainable_padding,
batch_size,
input_dim);
testMatrixProjectionBackward(context_start,
context_length,
trainable_padding,
batch_size,
input_dim);
}
}
}
}
}
}
......@@ -30,6 +30,20 @@ real FuncConfig::get<real>(const std::string& key) const {
return it->second.r;
}
template <>
int FuncConfig::get<int>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.i;
}
template <>
bool FuncConfig::get<bool>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.b;
}
template <>
FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
......@@ -44,6 +58,20 @@ FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
return *this;
}
template <>
FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].i = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].b = v;
return *this;
}
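// Usage sketch (illustrative): values are chained in with set<T>() and read
// back with get<T>(); the key/type pairs below mirror the ones used by
// ContextProjection elsewhere in this change.
//
//   FuncConfig config = FuncConfig()
//                           .set("context_length", (size_t)3)
//                           .set("context_start", -1)
//                           .set("is_padding", true);
//   bool is_padding = config.get<bool>("is_padding");
//   int context_start = config.get<int>("context_start");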
ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
} // namespace paddle
......@@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
using type = GpuMatrix;
};
template <DeviceType Device>
struct SequenceT;
template <>
struct SequenceT<DEVICE_TYPE_CPU> {
using type = CpuIVector;
};
template <>
struct SequenceT<DEVICE_TYPE_GPU> {
using type = GpuIVector;
};
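// Usage sketch (illustrative): device-templated code picks its concrete
// matrix/vector types through these traits, e.g.
//
//   template <DeviceType Device>
//   void SomeOp(typename MatrixT<Device>::type* mat,
//               const typename SequenceT<Device>::type& seq);
//
// With Device = DEVICE_TYPE_CPU this resolves to (CpuMatrix*, const
// CpuIVector&); with DEVICE_TYPE_GPU, to (GpuMatrix*, const GpuIVector&).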
typedef std::vector<size_t> Dims;
class Tensor {
......@@ -59,6 +72,8 @@ public:
union value {
size_t s;
real r;
int i;
bool b;
};
template <typename T>
......
......@@ -33,25 +33,33 @@ public:
// init cpu and gpu arguments
auto initArgs = [=](
Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
for (auto arg : inArgs) {
for (const auto arg : inArgs) {
size_t size = sizeof(real);
for (auto dim : arg.dims_) {
for (const auto dim : arg.dims_) {
size *= dim;
}
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(
Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
gpuArgs.emplace_back(
Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
// will use an api to refactor this code.
CpuVector cpuVector(size / sizeof(real),
(real*)cpuArgs.back().getData());
GpuVector gpuVector(size / sizeof(real),
(real*)gpuArgs.back().getData());
cpuVector.uniform(0.001, 1);
gpuVector.copyFrom(cpuVector);
if (arg.getData()) {
// TODO(tianbing): this wastes unnecessary memory here
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
// already init outside
} else {
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(
Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
gpuArgs.emplace_back(
Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
// will use an api to refactor this code.
CpuVector cpuVector(size / sizeof(real),
(real*)cpuArgs.back().getData());
GpuVector gpuVector(size / sizeof(real),
(real*)gpuArgs.back().getData());
cpuVector.uniform(0.001, 1);
gpuVector.copyFrom(cpuVector);
}
}
};
initArgs(cpuInputs, gpuInputs, inputs);
......@@ -81,6 +89,10 @@ public:
checkArgs(cpuInouts, gpuInouts);
}
std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
protected:
std::shared_ptr<FunctionBase> cpu;
std::shared_ptr<FunctionBase> gpu;
......
......@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
CHECK_EQ(inputDim * totalPad, parameter->getSize());
weight_.reset(new Weight(totalPad, inputDim, parameter));
}
// init forward_ and backward_ functions
init();
}
bool ContextProjection::init() {
size_t context_length = config_.context_length();
int context_start = config_.context_start();
bool is_padding = config_.trainable_padding();
size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
createFunction(forward_,
"ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", beginPad_));
createFunction(backward_,
"ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", beginPad_)
.set("is_padding", is_padding)
.set("total_pad", total_pad));
return true;
}
void ContextProjection::resetState() {
......@@ -78,25 +104,29 @@ LayerStatePtr ContextProjection::getState() {
}
void ContextProjection::forward() {
CHECK(in_->value);
CHECK(in_->value && out_->value);
CHECK(in_->sequenceStartPositions);
auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
int64_t inputDim = in_->value->getWidth();
int64_t dim = out_->value->getWidth();
CHECK_EQ(dim, inputDim * config_.context_length());
size_t input_dim = in_->value->getWidth();
size_t dim = out_->value->getWidth();
CHECK_EQ(dim, input_dim * config_.context_length());
size_t batch_size = in_->value->getHeight();
CHECK_EQ(forward_.size(), 1) << "Only one forward function here";
REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
bool isPadding = config_.trainable_padding();
out_->value->contextProjectionForward(
*(in_->value),
state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
*startPositions,
config_.context_length(),
config_.context_start(),
beginPad_,
state_ ? true : isPadding);
bool is_padding = config_.trainable_padding();
/// use state_ first if present, otherwise weight_ (padding disabled == null weight)
auto w_ptr =
state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
auto start_pos = in_->sequenceStartPositions;
forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
Tensor(w_ptr ? w_ptr->getData() : nullptr,
Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
Tensor(reinterpret_cast<real*>(
const_cast<int*>(start_pos->getData(useGpu_))),
Dims{start_pos->getSize()})},
{Tensor(out_->value->getData(), Dims{batch_size, dim})},
{});
if (state_ && config_.context_start() < 0) {
CHECK_EQ(1, in_->getNumSequences());
......@@ -118,41 +148,27 @@ void ContextProjection::forward() {
}
void ContextProjection::backward(const UpdateCallback& callback) {
CHECK(in_->value);
int64_t inputDim = in_->value->getWidth();
int64_t dim = out_->value->getWidth();
CHECK_EQ(dim, inputDim * config_.context_length());
auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
CHECK(in_->value && out_->value && out_->grad);
size_t input_dim = in_->value->getWidth();
size_t dim = out_->value->getWidth();
CHECK_EQ(dim, input_dim * config_.context_length());
size_t batch_size = in_->value->getHeight();
CHECK_EQ(batch_size, out_->value->getHeight());
CHECK_EQ(backward_.size(), 1) << "Only one backward function here";
REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
bool isPadding = config_.trainable_padding();
if (!out_->grad->useGpu()) {
out_->grad->contextProjectionBackward(
in_->grad.get(),
isPadding ? weight_->getWGrad().get() : nullptr,
*startPositions,
config_.context_length(),
config_.context_start(),
beginPad_,
isPadding);
} else {
if (in_->grad) {
out_->grad->contextProjectionBackwardData(*(in_->grad),
*startPositions,
config_.context_length(),
config_.context_start());
}
if (isPadding && weight_->getWGrad()) {
out_->grad->contextProjectionBackwardWeight(
*(weight_->getWGrad()),
*startPositions,
config_.context_length(),
config_.context_start(),
weight_->getWGrad()->getHeight(),
beginPad_);
}
}
bool is_padding = config_.trainable_padding();
auto start_pos = in_->sequenceStartPositions;
auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
Dims{batch_size, input_dim}),
Tensor(w_ptr ? w_ptr->getData() : nullptr,
Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
Tensor(reinterpret_cast<real*>(
const_cast<int*>(start_pos->getData(useGpu_))),
Dims{start_pos->getSize()})},
{Tensor(out_->grad->getData(), Dims{batch_size, dim})},
{});
if (config_.trainable_padding()) {
weight_->getParameterPtr()->incUpdate(callback);
......
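Both calc() calls above pass the sequence boundaries as a flat offsets vector: N sequences are encoded as N + 1 start positions, which is why the matrix code below computes numSequences as sequence.getSize() - 1. A quick worked example of that layout:

#include <cstdio>
#include <vector>

// starts = {0, 3, 5} encodes two sequences: rows [0, 3) and rows [3, 5).
int main() {
  std::vector<int> starts = {0, 3, 5};
  size_t numSequences = starts.size() - 1;  // matches sequence.getSize() - 1
  for (size_t i = 0; i < numSequences; ++i) {
    std::printf("sequence %zu: rows [%d, %d), length %d\n",
                i, starts[i], starts[i + 1], starts[i + 1] - starts[i]);
  }
  return 0;
}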
......@@ -61,6 +61,8 @@ public:
virtual LayerStatePtr getState();
virtual bool init();
protected:
std::unique_ptr<Weight> weight_;
/// number of extra timesteps added at the beginning
......
......@@ -88,11 +88,37 @@ public:
*/
virtual LayerStatePtr getState() { return nullptr; }
/**
* Initialize the forward_ and backward_ functions.
*/
virtual bool init() { return true; }
/**
* Get output size of projection.
*/
size_t getOutputSize() const { return config_.output_size(); }
protected:
/**
* Create a layer function, which is called in forward() or backward().
* \param function, Projection::forward_ or Projection::backward_
* \param name, function name
* \param config, initialization configuration for the function
*/
void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
const std::string& name,
const FuncConfig& config) {
if (useGpu_) {
function.emplace_back(
FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
} else {
function.emplace_back(
FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
}
auto& func = function.back();
func->init(config);
}
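createFunction() picks the device-specific kernel by appending "-CPU" or "-GPU" to the name and asking the registrar for a factory. A from-scratch sketch of such a name-to-factory registrar, written for illustration only (not Paddle's actual registrar class):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct FuncBase {
  virtual ~FuncBase() = default;
};

// Maps a type name to a factory; createByType() instantiates by name.
class Registrar {
public:
  using Factory = std::function<std::shared_ptr<FuncBase>()>;
  void add(const std::string& type, Factory f) {
    factories_[type] = std::move(f);
  }
  std::shared_ptr<FuncBase> createByType(const std::string& type) const {
    return factories_.at(type)();  // throws if the type was never registered
  }

private:
  std::map<std::string, Factory> factories_;
};

int main() {
  Registrar registrar;
  registrar.add("Demo-CPU", [] { return std::make_shared<FuncBase>(); });
  auto fn = registrar.createByType("Demo-CPU");  // select variant by suffix
  return fn ? 0 : 1;
}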
protected:
/// Config of projection
ProjectionConfig config_;
......@@ -106,5 +132,9 @@ protected:
const Argument* out_;
/// Store `passType` passed to forward()
PassType passType_;
/// The projection's forward function(s)
std::vector<std::shared_ptr<FunctionBase>> forward_;
/// The projection's backward function(s)
std::vector<std::shared_ptr<FunctionBase>> backward_;
};
} // namespace paddle
......@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
}
}
}
int main(int argc, char** argv) {
initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
} // end for (int numIdSlots : numSlotsArray)
} // end for (int numSparseNonValueVecSlots : numSlotsArray)
}
int main(int argc, char** argv) {
initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -1304,68 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
}
void GpuMatrix::contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
CHECK(dynamic_cast<GpuMatrix*>(&input));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
CHECK_EQ(getWidth(), input.getWidth() * contextLength);
hl_context_projection_forward(input.getData(),
sequence.getData(),
isPadding ? weight->getData() : NULL,
getData(),
sequence.getSize() - 1,
input.getWidth(),
contextLength,
contextStart,
beginPad,
isPadding);
}
void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart) {
CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
hl_context_projection_backward_data(getData(),
sequence.getData(),
inputGrad.getData(),
sequence.getSize() - 1,
inputGrad.getWidth(),
contextLength,
contextStart);
}
void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad) {
CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
hl_context_projection_backward_weight(getData(),
sequence.getData(),
weightGrad.getData(),
sequence.getSize() - 1,
weightGrad.getWidth(),
totalPad,
contextLength,
contextStart,
beginPad);
}
void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
CHECK(data.useGpu_ == true && W.useGpu_ == true)
<< "Matrix type are not equal";
......@@ -2203,113 +2141,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
}
}
void CpuMatrix::contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
CHECK(input_ptr && seq_ptr);
if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
const int* starts = seq_ptr->getData();
size_t numSequences = seq_ptr->getSize() - 1;
for (size_t i = 0; i < numSequences; ++i) {
for (int j = 0; j < contextLength; ++j) {
int begin = starts[i] + contextStart + j;
int end = starts[i + 1] + contextStart + j;
int dstBegin = starts[i];
int dstEnd = starts[i + 1];
if (begin < starts[i]) {
int64_t padSize =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
MatrixPtr mat = this->subMatrix(starts[i], padSize);
if (isPadding) {
MatrixPtr sub = weight->subMatrix(j, padSize);
mat->addAtOffset(*sub, j * input_ptr->getWidth());
}
dstBegin = starts[i] + padSize;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t padSize =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
if (isPadding) {
MatrixPtr sub =
weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
mat->addAtOffset(*sub, j * input_ptr->getWidth());
}
dstEnd = starts[i + 1] - padSize;
end = starts[i + 1];
}
if (end <= begin) continue;
MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
dst->addAtOffset(*src, j * input_ptr->getWidth());
}
}
}
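The loop above implements a sliding-window concatenation: output row r receives input rows r + contextStart through r + contextStart + contextLength - 1 side by side, clipped at sequence boundaries, with the trainable padding rows filling the clipped positions. A simplified single-sequence re-derivation without padding, for illustration only (not the Paddle implementation):

#include <algorithm>
#include <vector>

// Output row r is the concatenation of input rows r+contextStart ..
// r+contextStart+contextLength-1; out-of-range rows stay zero.
std::vector<std::vector<float>> contextProject(
    const std::vector<std::vector<float>>& input,  // [seqLen][inputDim]
    int contextStart,
    int contextLength) {
  int seqLen = static_cast<int>(input.size());
  int inputDim = static_cast<int>(input[0].size());
  std::vector<std::vector<float>> out(
      seqLen, std::vector<float>(inputDim * contextLength, 0.f));
  for (int r = 0; r < seqLen; ++r) {
    for (int j = 0; j < contextLength; ++j) {
      int src = r + contextStart + j;
      if (src < 0 || src >= seqLen) continue;  // zero padding at the edges
      std::copy(input[src].begin(), input[src].end(),
                out[r].begin() + j * inputDim);
    }
  }
  return out;
}

int main() {
  std::vector<std::vector<float>> in = {{1, 2}, {3, 4}, {5, 6}};
  auto out = contextProject(in, /*contextStart=*/-1, /*contextLength=*/3);
  // out[1] == {1, 2, 3, 4, 5, 6}: rows 0, 1, 2 concatenated side by side.
  return out[1][0] == 1.f ? 0 : 1;
}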
void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
CHECK(dynamic_cast<const CpuIVector*>(&sequence));
int64_t inputDim = inputGrad ? inputGrad->getWidth()
: weightGrad ? weightGrad->getWidth() : 0;
CHECK_EQ(getWidth(), inputDim * contextLength);
const int* starts = sequence.getData();
size_t numSequences = sequence.getSize() - 1;
for (size_t i = 0; i < numSequences; ++i) {
for (int j = 0; j < contextLength; ++j) {
int begin = starts[i] + contextStart + j;
int end = starts[i + 1] + contextStart + j;
int dstBegin = starts[i];
int dstEnd = starts[i + 1];
if (begin < starts[i]) {
int64_t padSize =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (isPadding && weightGrad) {
MatrixPtr mat = this->subMatrix(starts[i], padSize);
MatrixPtr sub = weightGrad->subMatrix(j, padSize);
sub->addAtOffset(*mat, j * inputDim);
}
dstBegin = starts[i] + padSize;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t padSize =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (isPadding && weightGrad) {
MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
MatrixPtr sub = weightGrad->subMatrix(
beginPad + contextStart + j - padSize, padSize);
sub->addAtOffset(*mat, j * inputDim);
}
dstEnd = starts[i + 1] - padSize;
end = starts[i + 1];
}
if (end <= begin) continue;
if (!inputGrad) continue;
MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
src->addAtOffset(*dst, j * inputDim);
}
}
}
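Companion sketch for the backward pass: the same row mapping runs in reverse, and each slice of the output gradient is accumulated (not assigned) into the input gradient, since several context windows can touch the same input row. Again a simplified, padding-free illustration rather than the Paddle implementation:

#include <vector>

// Scatter each output-gradient slice back to the input row it came from.
void contextProjectBackward(
    const std::vector<std::vector<float>>& outGrad,  // [seqLen][dim*ctxLen]
    std::vector<std::vector<float>>& inGrad,         // [seqLen][inputDim]
    int contextStart,
    int contextLength) {
  int seqLen = static_cast<int>(inGrad.size());
  int inputDim = static_cast<int>(inGrad[0].size());
  for (int r = 0; r < seqLen; ++r) {
    for (int j = 0; j < contextLength; ++j) {
      int src = r + contextStart + j;
      if (src < 0 || src >= seqLen) continue;
      for (int d = 0; d < inputDim; ++d) {
        inGrad[src][d] += outGrad[r][j * inputDim + d];  // accumulate
      }
    }
  }
}

int main() {
  std::vector<std::vector<float>> outGrad = {{1, 1, 1, 1}, {1, 1, 1, 1}};
  std::vector<std::vector<float>> inGrad = {{0, 0}, {0, 0}};
  contextProjectBackward(outGrad, inGrad,
                         /*contextStart=*/0, /*contextLength=*/2);
  return inGrad[1][0] == 2.f ? 0 : 1;  // row 1 fed by two context windows
}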
inline void vecAddTo(real* a, const real* b, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
a[i] += b[i];
......
......@@ -972,42 +972,6 @@ public:
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart) {
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad) {
LOG(FATAL) << "Not implemented";
}
/**
* @code
* this.row[i] += table.row[ids[i]]
......@@ -1442,26 +1406,6 @@ public:
const IVector& sequence,
IVector& index);
void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
void contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart);
void contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad);
void bilinearForward(const Matrix& in,
const size_t inImgH,
const size_t inImgW,
......@@ -1648,22 +1592,6 @@ public:
const IVector& sequence,
IVector& index);
void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
void contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
virtual real* getRowBuf(size_t row) { return getRow(row); }
......
......@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
}
}
#endif
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
......@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
int ret = RUN_ALL_TESTS();
return ret;
}
#endif
......@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
testWrapper(functor);
}
#endif
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -291,10 +291,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
......@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
}
}
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -561,9 +561,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
checkSMatrixEqual2(matA, matD);
#endif
}
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
#endif
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
hl_start();
hl_init(0);
return RUN_ALL_TESTS();
}
......@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
}
TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
hl_start();
hl_init(FLAGS_gpu_id);
return RUN_ALL_TESTS();
}
......@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
checkMatrixEqual(cBatchTransMat, cMat_d2h);
}
#endif
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
testMatrixCase(testSgdUpdate<GpuMatrix>);
}
#endif
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
hl_start();
hl_init(0);
return RUN_ALL_TESTS();
}
......@@ -29,148 +29,6 @@ using namespace std; // NOLINT
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;
void testMatrixProjectionForward(int contextStart,
int contextLength,
bool padding,
int batchSize,
int inputDim) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInput->randomizeUniform();
gpuInput->copyFrom(*cpuInput);
int pad = std::max(0, -contextStart) +
std::max(0, contextStart + contextLength - 1);
if (pad == 0) padding = false;
MatrixPtr cpuWeight = nullptr;
MatrixPtr gpuWeight = nullptr;
if (padding) {
cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
cpuWeight->randomizeUniform();
gpuWeight->copyFrom(*cpuWeight);
}
IVectorPtr cpuSequence;
generateSequenceStartPositions(batchSize, cpuSequence);
IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
gpuSequence->copyFrom(*cpuSequence);
MatrixPtr cpuOutput =
std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
MatrixPtr gpuOutput =
std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
cpuOutput->randomizeUniform();
gpuOutput->copyFrom(*cpuOutput);
// calculate
int beginPad = std::max(0, -contextStart);
cpuOutput->contextProjectionForward(*cpuInput,
cpuWeight.get(),
*cpuSequence,
contextLength,
contextStart,
beginPad,
padding);
gpuOutput->contextProjectionForward(*gpuInput,
gpuWeight.get(),
*gpuSequence,
contextLength,
contextStart,
beginPad,
padding);
TensorCheckEqual(*cpuOutput, *gpuOutput);
}
void testMatrixProjectionBackward(int contextStart,
int contextLength,
bool padding,
int batchSize,
int inputDim) {
MatrixPtr cpuOutputGrad =
std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
MatrixPtr gpuOutputGrad =
std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
cpuOutputGrad->randomizeUniform();
gpuOutputGrad->copyFrom(*cpuOutputGrad);
IVectorPtr cpuSequence;
generateSequenceStartPositions(batchSize, cpuSequence);
IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
gpuSequence->copyFrom(*cpuSequence);
MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInputGrad->randomizeUniform();
gpuInputGrad->copyFrom(*cpuInputGrad);
int pad = std::max(0, -contextStart) +
std::max(0, contextStart + contextLength - 1);
if (pad == 0) padding = false;
MatrixPtr cpuWeightGrad = nullptr;
MatrixPtr gpuWeightGrad = nullptr;
if (padding) {
cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
cpuWeightGrad->randomizeUniform();
gpuWeightGrad->copyFrom(*cpuWeightGrad);
}
// calculate
int beginPad = std::max(0, -contextStart);
cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(),
cpuWeightGrad.get(),
*cpuSequence,
contextLength,
contextStart,
beginPad,
padding);
gpuOutputGrad->contextProjectionBackwardData(
*gpuInputGrad, *gpuSequence, contextLength, contextStart);
if (padding) {
gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad,
*gpuSequence,
contextLength,
contextStart,
pad,
beginPad);
}
TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
if (padding) {
TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
}
}
TEST(Matrix, projection) {
for (auto contextStart : {-5, -3, -1, 0, 3}) {
for (auto contextLength : {1, 2, 5, 7}) {
for (auto trainablePadding : {false, true}) {
for (auto batchSize : {1, 2, 5, 20, 100}) {
for (auto inputDim : {15, 32, 63, 128, 200}) {
VLOG(3) << " contextStart=" << contextStart
<< " contextLength=" << contextLength
<< " trainablePadding=" << trainablePadding
<< " batchSize=" << batchSize << " inputDim=" << inputDim;
testMatrixProjectionForward(contextStart,
contextLength,
trainablePadding,
batchSize,
inputDim);
testMatrixProjectionBackward(contextStart,
contextLength,
trainablePadding,
batchSize,
inputDim);
}
}
}
}
}
}
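The tests derive the padding sizes from contextStart and contextLength before each run; here is a worked instance of that arithmetic:

#include <algorithm>
#include <cassert>

// pad counts the weight rows needed at both ends of a sequence;
// beginPad counts only the leading ones.
int main() {
  int contextStart = -2, contextLength = 5;
  int pad = std::max(0, -contextStart) +
            std::max(0, contextStart + contextLength - 1);
  int beginPad = std::max(0, -contextStart);
  assert(pad == 4);       // 2 rows before the sequence + 2 rows after
  assert(beginPad == 2);  // only the leading rows
  return 0;
}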
void testMatrixMaxSequence(int batchSize, int inputDim) {
// forward
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
......@@ -1262,10 +1120,4 @@ TEST(Matrix, MaxOutFwdBwd) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
......@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
int ret = RUN_ALL_TESTS();
return ret;
}
#endif
......@@ -23,15 +23,6 @@ limitations under the License. */
using namespace paddle; // NOLINT
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
int ret = RUN_ALL_TESTS();
return ret;
}
class CommonTest : public ::testing::Test {
protected:
CommonTest() : testStat_("test") {}
......
# for paddle test case
if(WITH_TESTING)
add_library(paddle_test_main STATIC TestMain.cpp)
add_dependencies(paddle_test_main gen_proto_cpp)
endif()
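With paddle_test_main in place, every test binary gets its entry point from this library, which is why the per-file main() functions above were all deleted. Below is a plausible sketch of what TestMain.cpp provides, inferred from the removed mains; the exact file contents and the header declaring paddle::initMain are assumptions here.

#include <gtest/gtest.h>
#include "paddle/utils/Util.h"  // assumed header declaring paddle::initMain

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  paddle::initMain(argc, argv);  // Paddle setup, as in the removed mains
  return RUN_ALL_TESTS();
}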
......@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
public:
RemoteParameterUpdater(
const OptimizationConfig& config,
int expectedPpassCount,
int expectedPassCount,
std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
~RemoteParameterUpdater() {
if (controllerThread_) {
......@@ -146,7 +146,7 @@ protected:
BatchStatus batchStatus_;
/// controller thread for sync-sgd
std::unique_ptr<std::thread> controllerThread_;
/// passed alread finished
/// number of passes already finished
int64_t passCount_;
/// expected number of passes to finish
int64_t expectedPassCount_;
......
......@@ -37,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
p = new unsigned int(defaultSeed_ - 1);
} else {
p = new unsigned int(defaultSeed_ + getTID());
LOG(INFO) << "thread use undeterministic rand seed:" << *p;
VLOG(3) << "thread use undeterministic rand seed:" << *p;
}
seed_.set(p);
}
......
......@@ -125,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
void runInitFunctions() {
std::call_once(g_onceFlag, []() {
LOG(INFO) << "Calling runInitFunctions";
VLOG(3) << "Calling runInitFunctions";
if (g_initFuncs) {
std::sort(g_initFuncs->begin(),
g_initFuncs->end(),
......@@ -139,7 +139,7 @@ void runInitFunctions() {
g_initFuncs = nullptr;
}
g_initialized = true;
LOG(INFO) << "Call runInitFunctions done.";
VLOG(3) << "Call runInitFunctions done.";
});
}
......@@ -231,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
} // namespace path
void copyFileToPath(const std::string& file, const std::string& dir) {
LOG(INFO) << "copy " << file << " to " << dir;
VLOG(3) << "copy " << file << " to " << dir;
std::string fileName = path::basename(file);
std::string dst = path::join(dir, fileName);
std::ifstream source(file, std::ios_base::binary);
......
......@@ -96,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
}
});
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2;
LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512;
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -53,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
});
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
ASSERT_EQ(res[i], i);
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -64,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
});
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}