Commit 4fb0c3bb authored by caoying03

small code cleans.

Parent 04907b17
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import csv
import cPickle
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gzip
import argparse
import itertools
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import paddle.v2 as paddle
from paddle.v2 import layer
from paddle.v2 import data_type as dtype
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import argparse
import gzip
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import itertools
......@@ -32,9 +30,10 @@ parser.add_argument(
type=int,
required=True,
default=ModelType.CLASSIFICATION_MODE,
help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
% (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
help=("model type, %d for classification, %d for pairwise rank, "
"%d for regression (default: classification)") %
(ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
parser.add_argument(
'-s',
'--source_dic_path',
......@@ -45,8 +44,8 @@ parser.add_argument(
'--target_dic_path',
type=str,
required=False,
help="path of the target's word dic, if not set, the `source_dic_path` will be used"
)
help=("path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used"))
parser.add_argument(
'-a',
'--model_arch',
......@@ -69,8 +68,9 @@ parser.add_argument(
'--dnn_dims',
type=str,
default='256,128,64,32',
help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32"
)
help=("dimentions of dnn layers, default is '256,128,64,32', "
"which means create a 4-layer dnn, "
"demention of each layer is 256, 128, 64 and 32"))
parser.add_argument(
'-c',
'--class_num',
......@@ -85,7 +85,8 @@ if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task."
layer_dims = map(int, args.dnn_dims.split(','))
args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path
args.target_dic_path = args.source_dic_path if not args.target_dic_path \
else args.target_dic_path
paddle.init(use_gpu=False, trainer_count=1)
......@@ -130,9 +131,9 @@ class Inferer(object):
for id, batch in enumerate(infer_reader()):
res = self.inferer.infer(input=batch)
predictions = [' '.join(map(str, x)) for x in res]
assert len(batch) == len(
predictions), "predict error, %d inputs, but %d predictions" % (
len(batch), len(predictions))
assert len(batch) == len(predictions), (
"predict error, %d inputs, "
"but %d predictions") % (len(batch), len(predictions))
output_f.write('\n'.join(map(str, predictions)) + '\n')
......
......@@ -29,9 +29,9 @@ class DSSM(object):
@class_num: int
number of categories.
'''
assert len(
vocab_sizes
) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
assert len(vocab_sizes) == 2, (
"vocab_sizes specifies the vocabulary sizes of the left and "
"right inputs, so its length should be 2.")
assert len(dnn_dims) > 1, "at least two layers are needed."
self.dnn_dims = dnn_dims
......@@ -91,7 +91,8 @@ class DSSM(object):
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
prefix of the layers' names, used to share parameters
among multiple `fc` parts.
'''
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
......@@ -113,7 +114,8 @@ class DSSM(object):
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
prefix of the layers' names, used to share parameters
among multiple `cnn` parts.
'''
def create_conv(context_len, hidden_size, prefix):
......@@ -174,7 +176,8 @@ class DSSM(object):
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
- label, 1 if left_target should be sorted in front of
right_target, otherwise 0.
'''
logger.info("build rank model")
assert self.model_type.is_rank()
......
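Aside: the rank-model docstring above lists the four fields of a pairwise training record (source, left_target, right_target, label). Below is a minimal, self-contained sketch of what such a record could look like, using a made-up vocabulary and a hypothetical to_ids helper rather than the repo's reader code.

# hypothetical vocabulary; real runs load it with load_dic(source_dic_path)
word_dic = {'<unk>': 0, 'deep': 1, 'learning': 2, 'rank': 3, 'model': 4}

def to_ids(sentence, dic):
    # map each word to its id, falling back to <unk> for unknown words
    return [dic.get(w, dic['<unk>']) for w in sentence.split()]

source = to_ids('deep learning rank model', word_dic)
left_target = to_ids('rank model', word_dic)
right_target = to_ids('learning', word_dic)
label = 1  # left_target should be ranked in front of right_target
record = (source, left_target, right_target, label)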
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utils import UNK, ModelType, TaskType, load_dic, sent2ids, logger, ModelType
from utils import UNK, ModelType, TaskType, load_dic, \
sent2ids, logger, ModelType
class Dataset(object):
......@@ -38,7 +37,6 @@ class Dataset(object):
'''
Load testset.
'''
# logger.info("[reader] load testset from %s" % self.test_path)
with open(self.test_path) as f:
for line_id, line in enumerate(f):
yield self.record_reader(line)
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import paddle.v2 as paddle
......@@ -31,8 +29,8 @@ parser.add_argument(
'--target_dic_path',
type=str,
required=False,
help="path of the target's word dic, if not set, the `source_dic_path` will be used"
)
help=("path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used"))
parser.add_argument(
'-b',
'--batch_size',
......@@ -221,7 +219,8 @@ def train(train_data_path=None,
event.pass_id, event.batch_id, event.cost, event.metrics))
# test model
if event.batch_id > 0 and event.batch_id % args.num_batches_to_test == 0:
if event.batch_id > 0 and \
event.batch_id % args.num_batches_to_test == 0:
if test_reader is not None:
if model_type.is_classification():
result = trainer.test(
......@@ -231,7 +230,8 @@ def train(train_data_path=None,
else:
result = None
# save model
if event.batch_id > 0 and event.batch_id % args.num_batches_to_save_model == 0:
if event.batch_id > 0 and \
event.batch_id % args.num_batches_to_save_model == 0:
model_desc = "{type}_{arch}".format(
type=str(args.model_type), arch=str(args.model_arch))
with open("%sdssm_%s_pass_%05d.tar" %
......
<html>
<head>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
box-sizing: border-box;
min-width: 200px;
max-width: 980px;
margin: 0 auto;
padding: 45px;
}
</style>
<body>
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
[TBD]
</div>
<!-- You can change the lines below now. -->
<script type="text/javascript">
marked.setOptions({
renderer: new marked.Renderer(),
gfm: true,
breaks: false,
smartypants: true,
highlight: function(code, lang) {
code = code.replace(/&amp;/g, "&")
code = code.replace(/&gt;/g, ">")
code = code.replace(/&lt;/g, "<")
code = code.replace(/&nbsp;/g, " ")
return hljs.highlightAuto(code, [lang]).value;
}
});
document.getElementById("context").innerHTML = marked(
document.getElementById("markdown").innerHTML)
</script>
</body>
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
import paddle.v2 as paddle
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
......
File mode changed from 100755 to 100644
import os, sys
import os
import sys
import gzip
import paddle.v2 as paddle
import numpy as np
import functools
import argparse
import numpy as np
import paddle.v2 as paddle
def lambda_rank(input_dim):
"""
lambda_rank is a Listwise rank model, the input data and label must be sequences.
lambda_rank is a listwise ranking model; the input data and label
must be sequences.
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
parameters :
input_dim, one document's dense feature vector dimension
......@@ -16,6 +20,7 @@ def lambda_rank(input_dim):
format of the dense_vector_sequence:
[[f, ...], [f, ...], ...], f is a float or an int number
"""
label = paddle.layer.data("label",
paddle.data_type.dense_vector_sequence(1))
data = paddle.layer.data("data",
......@@ -88,11 +93,11 @@ def train_lambda_rank(num_passes):
def lambda_rank_infer(pass_id):
"""lambda_rank model inference interface
parameters:
pass_id : inference model in pass_id
"""
"""
lambda_rank model inference interface.
parameters:
pass_id : run inference with the model saved at this pass
"""
print "Begin to Infer..."
input_dim = 46
output = lambda_rank(input_dim)
......@@ -109,7 +114,8 @@ def lambda_rank_infer(pass_id):
if len(infer_data) == infer_data_num:
break
# predict score of infer_data document. Re-sort the document base on predict score
# predict a score for each document in infer_data,
# then re-sort the documents by the predicted scores
# in descending order to build the final ranking
predicitons = paddle.infer(
output_layer=output, parameters=parameters, input=infer_data)
......
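Aside: the comments above describe feeding dense_vector_sequence inputs (one sequence of 46-dimensional feature vectors per query) and re-sorting documents by the predicted scores in descending order. The following is a small sketch of that post-processing in plain numpy, with random data standing in for the output of paddle.infer.

import numpy as np

input_dim = 46   # one document = one 46-dimensional dense feature vector
num_docs = 5
# one query = one sequence of feature vectors: [[f, ...], [f, ...], ...]
infer_data = [[list(np.random.rand(input_dim)) for _ in range(num_docs)]]

# stand-in for the scores returned by paddle.infer(), one per document
predictions = np.random.rand(num_docs)

# re-sort document indices by predicted score in descending order
ranking = sorted(range(num_docs), key=lambda i: predictions[i], reverse=True)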
......@@ -12,7 +12,6 @@ def ndcg(score_list):
e.g. predict rank score list :
>>> scores = [3, 2, 3, 0, 1, 2]
>>> ndcg_score = ndcg(scores)
"""
def dcg(score_list):
......
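Aside: for readers unfamiliar with the metric, here is a generic NDCG computation in the spirit of the docstring above; the gain and discount follow the common (2^rel - 1) / log2(rank + 1) form and may differ in detail from the ndcg() defined in this file.

import math

def ndcg_example(score_list):
    def dcg(scores):
        # discounted cumulative gain with exponential gain
        return sum((2 ** s - 1) / math.log(i + 2, 2)
                   for i, s in enumerate(scores))
    ideal = dcg(sorted(score_list, reverse=True))
    return dcg(score_list) / ideal if ideal > 0 else 0.0

# the docstring's example list yields a value in (0, 1]
# ndcg_example([3, 2, 3, 0, 1, 2])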
......@@ -13,11 +13,11 @@ import argparse
def half_ranknet(name_prefix, input_dim):
"""
parameter in same name will be shared in paddle framework,
these parameters in ranknet can be used in shared state, e.g. left network and right network
shared parameters in detail
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
"""
parameters with the same name are shared in the paddle framework,
which lets the left and right networks in ranknet share
their parameters; for details see
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
"""
# data layer
data = paddle.layer.data(name_prefix + "/data",
paddle.data_type.dense_vector(input_dim))
......@@ -102,12 +102,14 @@ def ranknet_infer(pass_id):
print "Begin to Infer..."
feature_dim = 46
# we just need half_ranknet to predict a rank score, which can be used in sort documents
# we only need half_ranknet to predict a rank score,
# which can be used to sort the documents
output = half_ranknet("infer", feature_dim)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("ranknet_params_%d.tar.gz" % (pass_id)))
# load data of same query and relevance documents, need ranknet to rank these candidates
# load the data of the same query and its relevant documents;
# ranknet is needed to rank these candidates
infer_query_id = []
infer_data = []
infer_doc_index = []
......@@ -121,7 +123,8 @@ def ranknet_infer(pass_id):
infer_query_id.append(query_id)
infer_data.append([feature_vector])
# predict score of infer_data document. Re-sort the document base on predict score
# predict a score for each document in infer_data,
# then re-sort the documents by the predicted scores
# in descending order to build the final ranking
scores = paddle.infer(
output_layer=output, parameters=parameters, input=infer_data)
......
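Aside: the half_ranknet docstring above relies on paddle's convention that layers whose parameters carry the same name share one set of weights, which is how the left and right towers stay identical. Below is a minimal sketch of that idea; the layer sizes and parameter names are illustrative, not the repo's exact configuration.

import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

def shared_tower(name_prefix, input_dim):
    data = paddle.layer.data(name_prefix + "/data",
                             paddle.data_type.dense_vector(input_dim))
    hidden = paddle.layer.fc(
        input=data,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(name="hidden_w"))
    # both towers name this parameter "output_w", so the weights are shared
    return paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(name="output_w"))

left_score = shared_tower("left", 46)
right_score = shared_tower("right", 46)  # reuses "hidden_w" and "output_w"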
......@@ -23,7 +23,7 @@ class ExternalMemory(object):
Besides, the ExternalMemory class must be used together with
paddle.layer.recurrent_group (within its step function). It can never be
used in a standalone manner.
For more details, please refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
......
"""
"""
Contains model configuration for external-memory-enhanced seq2seq.
The "external memory" refers to two types of memories.
......
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import os
import gzip
import numpy as np
......
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import math
import paddle.v2 as paddle
......
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import os
import logging
import gzip
......
#!/usr/bin/env python
import os
import logging
import numpy as np
......
#!/usr/bin/env python
import paddle.v2 as paddle
import sys
import gzip
......
#!/usr/bin/env python
import os
import logging
import paddle.v2 as paddle
......
......@@ -30,7 +30,8 @@ class RandomScheduleGenerator:
def getScheduleRate(self):
"""
Get the schedule sampling rate. Usually not needed to be called by the users
Get the scheduled sampling rate. Users usually do not need
to call this method directly.
"""
return self.schedule_computer(self.a, self.b, self.data_processed_)
......
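Aside: the getScheduleRate docstring above leaves the schedule_computer abstract. One standard choice from the scheduled sampling literature is a linear decay of the probability of feeding the ground-truth token; the sketch below is only an illustration, not necessarily the computer configured for this generator.

def linear_decay(a, b, data_processed):
    # probability of using the ground-truth token decays linearly with the
    # number of processed samples, floored at zero
    return max(0.0, a - b * data_processed)

# e.g. with a=1.0 and b=1e-6 the rate falls from 1.0 to 0.75
# after 250,000 samples and reaches 0.0 at 1,000,000 samples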
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import gzip
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import gzip
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import argparse
......