diff --git a/06.understand_sentiment/README.md b/06.understand_sentiment/README.md index f682f30f6123df13022e933039ab54c52c2318a6..f427f134de8ee0e3c9844869bb1961b6235b6c20 100644 --- a/06.understand_sentiment/README.md +++ b/06.understand_sentiment/README.md @@ -98,18 +98,21 @@ We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for senti After issuing a command `python train.py`, training will start immediately. The details will be unpacked by the following sessions to see how it works. -## Model Structure +## Model Configuration -### Initialize PaddlePaddle - -We must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc). +Our program starts with importing necessary packages and initializing some global variables: ```python -import sys -import paddle.v2 as paddle - -# PaddlePaddle init -paddle.init(use_gpu=False, trainer_count=1) +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +BATCH_SIZE = 128 +USE_GPU = False ``` As alluded to in section [Model Overview](#model-overview), here we provide the implementations of both Text CNN and Stacked-bidirectional LSTM models. @@ -118,212 +121,229 @@ As alluded to in section [Model Overview](#model-overview), here we provide the We create a neural network `convolution_net` as the following snippet code. -Note: `paddle.networks.sequence_conv_pool` includes both convolution and pooling layer operations. +Note: `fluid.nets.sequence_conv_pool` includes both convolution and pooling layer operations. ```python -def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - conv_3 = paddle.networks.sequence_conv_pool( - input=emb, context_len=3, hidden_size=hid_dim) - conv_4 = paddle.networks.sequence_conv_pool( - input=emb, context_len=4, hidden_size=hid_dim) - output = paddle.layer.fc(input=[conv_3, conv_4], - size=class_dim, - act=paddle.activation.Softmax()) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost, output -``` - -1. Define input data and its dimension +def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc( + input=[conv_3, conv_4], size=class_dim, act="softmax") + return prediction - Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. In `convolution_net`, the input to the network is defined in `paddle.layer.data`. - -1. Define Classifier - - The above Text CNN network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. +``` +Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. -1. 
Define Loss Function +The above Text CNN network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. - In the context of supervised learning, labels of the training set are defined in `paddle.layer.data`, too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. -#### Stacked bidirectional LSTM +### Stacked bidirectional LSTM We create a neural network `stacked_lstm_net` as below. ```python -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - """ - A Wrapper for sentiment classification task. - This network uses a bi-directional recurrent network, - consisting of three LSTM layers. This configuration is - motivated from the following paper, but uses few layers. - http://www.aclweb.org/anthology/P15-1109 - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - """ - assert stacked_num % 2 == 1 - - fc_para_attr = paddle.attr.Param(learning_rate=1e-3) - lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) - relu = paddle.activation.Relu() - linear = paddle.activation.Linear() - - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - - fc1 = paddle.layer.fc(input=emb, - size=hid_dim, - act=linear, - bias_attr=bias_attr) - lstm1 = paddle.layer.lstmemory( - input=fc1, act=relu, bias_attr=bias_attr) +def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): - fc = paddle.layer.fc(input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = paddle.layer.lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr) + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) inputs = [fc, lstm] - fc_last = paddle.layer.pooling( - input=inputs[0], pooling_type=paddle.pooling.Max()) - lstm_last = paddle.layer.pooling( - input=inputs[1], pooling_type=paddle.pooling.Max()) - output = paddle.layer.fc(input=[fc_last, lstm_last], - size=class_dim, - act=paddle.activation.Softmax(), - bias_attr=bias_attr, - param_attr=para_attr) - - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost, output -``` + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') -1. 
Define input data and its dimension + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + return prediction - Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. In `stacked_lstm_net`, the input to the network is defined in `paddle.layer.data`. +``` +The above stacked bidirectional LSTM network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. -1. Define Classifier +To reiterate, we can either invoke `convolution_net` or `stacked_lstm_net`. In below steps, we will go with `convolution_net`. - The above stacked bidirectional LSTM network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. +Next we define an `inference_program` that simply uses `convolution_net` to predict output with the input from `fluid.layer.data`. -1. Define Loss Function +```python +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) + return net +``` - In the context of supervised learning, labels of the training set are defined in `paddle.layer.data`, too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. +Then we define a `training_program` that uses the result from `inference_program` to compute the cost with label data. +Also define `optimizer_func` to specify the optimizer. -To reiterate, we can either invoke `convolution_net` or `stacked_lstm_net`. +In the context of supervised learning, labels of the training set are defined in `paddle.layer.data` too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. +First result that returns from the list must be cost. ```python -word_dict = paddle.dataset.imdb.word_dict() -dict_dim = len(word_dict) -class_dim = 2 +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + -# option 1 -[cost, output] = convolution_net(dict_dim, class_dim=class_dim) -# option 2 -# [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) ``` ## Model Training -### Define Parameters +### Specify training environment -First, we create the model parameters according to the previous model configuration `cost`. +Specify your training environment, you should specify if the training is on CPU or GPU. 
```python -# create parameters -parameters = paddle.parameters.create(cost) +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() ``` -### Create Trainer +### Datafeeder Configuration -Before jumping into creating a training module, algorithm setting is also necessary. -Here we specified `Adam` optimization algorithm via `paddle.optimizer`. +Next we define data feeders for test and train. The feeder reads a `buf_size` of data each time and feed them to the training/testing process. +`paddle.dataset.imdb.train` will yield records during each pass, after shuffling, a batch input of `BATCH_SIZE` is generated for training. -```python -# create optimizer -adam_optimizer = paddle.optimizer.Adam( - learning_rate=2e-3, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - -# create trainer -trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=adam_optimizer) -``` +Notice for loading and reading IMDB data, it could take up to 1 minute. Please be patient. -### Training +```python -`paddle.dataset.imdb.train()` will yield records during each pass, after shuffling, a batch input is generated for training. +print("Loading IMDB word dict....") +word_dict = paddle.dataset.imdb.word_dict() -```python +print ("Reading training data....") train_reader = paddle.batch( paddle.reader.shuffle( - lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=100) + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) +``` -test_reader = paddle.batch( - lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) + +### Create Trainer + +Create a trainer that takes `train_program` as input and specify optimizer function. + +```python +trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) ``` -`feeding` is devoted to specifying the correspondence between each yield record and `paddle.layer.data`. For instance, the first column of data generated by `paddle.dataset.imdb.train()` corresponds to `word` feature. +### Feeding Data + +`feed_order` is devoted to specifying the correspondence between each yield record and `paddle.layer.data`. For instance, the first column of data generated by `imdb.train` corresponds to `words`. ```python -feeding = {'word': 0, 'label': 1} +feed_order = ['words', 'label'] ``` -Callback function `event_handler` will be invoked to track training progress when a pre-defined event happens. +### Event Handler + +Callback function `event_handler` will be called during training when a pre-defined event happens. 
+For example, we can check the cost by `trainer.test` when `EndStepEvent` occurs ```python +# Specify the directory path to save the parameters +params_dirname = "understand_sentiment_conv.inference.model" + def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - with open('./params_pass_%d.tar' % event.pass_id, 'w') as f: - trainer.save_parameter_to_tar(f) - - result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + if isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + if event.step == 10: + trainer.save_params(params_dirname) + trainer.stop() ``` -Finally, we can invoke `trainer.train` to start training: +### Training + +Finally, we invoke `trainer.train` to start training with `num_epochs` and other parameters. ```python trainer.train( - reader=train_reader, + num_epochs=1, event_handler=event_handler, - feeding=feeding, - num_passes=10) + reader=train_reader, + feed_order=feed_order) ``` +## Inference + +### Create Inferencer + +Initialize Inferencer with `inference_program` and `params_dirname` which is where we save params from training. + +```python +inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=params_dirname, + place=place) +``` + +### Create Lod Tensor with test data + +To do inference, we pick 3 potential reviews out of our mind as testing data. Feel free to modify any of them. +We map each word in the reviews to id from `word_dict`, replaced by 'unknown' if the word is not in `word_dict`. +Then we create lod data with the id list and use `create_lod_tensor` to create lod tensor. + +```python +reviews_str = [ + 'read the book forget the movie', 'this is a great movie', 'this is very bad' +] +reviews = [c.split() for c in reviews_str] + +UNK = word_dict[''] +lod = [] +for c in reviews: + lod.append([word_dict.get(words, UNK) for words in c]) + +base_shape = [[len(c) for c in lod]] + +tensor_words = fluid.create_lod_tensor(lod, base_shape, place) +``` + +### Infer + +Now we can infer and predict probability of positive or negative from each review above. + +```python +results = inferencer.infer({'words': tensor_words}) + +for i, r in enumerate(results[0]): + print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'") + + +``` ## Conclusion diff --git a/06.understand_sentiment/index.html b/06.understand_sentiment/index.html index 76b4eb382fac33d7a6a9deafff2a897d0000b13a..54f0a5b2dd4cd403074f92a967932d081396c303 100644 --- a/06.understand_sentiment/index.html +++ b/06.understand_sentiment/index.html @@ -140,18 +140,21 @@ We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for senti After issuing a command `python train.py`, training will start immediately. The details will be unpacked by the following sessions to see how it works. -## Model Structure +## Model Configuration -### Initialize PaddlePaddle - -We must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc). 
+Our program starts with importing necessary packages and initializing some global variables: ```python -import sys -import paddle.v2 as paddle - -# PaddlePaddle init -paddle.init(use_gpu=False, trainer_count=1) +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +BATCH_SIZE = 128 +USE_GPU = False ``` As alluded to in section [Model Overview](#model-overview), here we provide the implementations of both Text CNN and Stacked-bidirectional LSTM models. @@ -160,212 +163,229 @@ As alluded to in section [Model Overview](#model-overview), here we provide the We create a neural network `convolution_net` as the following snippet code. -Note: `paddle.networks.sequence_conv_pool` includes both convolution and pooling layer operations. +Note: `fluid.nets.sequence_conv_pool` includes both convolution and pooling layer operations. ```python -def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - conv_3 = paddle.networks.sequence_conv_pool( - input=emb, context_len=3, hidden_size=hid_dim) - conv_4 = paddle.networks.sequence_conv_pool( - input=emb, context_len=4, hidden_size=hid_dim) - output = paddle.layer.fc(input=[conv_3, conv_4], - size=class_dim, - act=paddle.activation.Softmax()) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost, output -``` - -1. Define input data and its dimension +def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc( + input=[conv_3, conv_4], size=class_dim, act="softmax") + return prediction - Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. In `convolution_net`, the input to the network is defined in `paddle.layer.data`. - -1. Define Classifier - - The above Text CNN network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. +``` +Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. -1. Define Loss Function +The above Text CNN network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. - In the context of supervised learning, labels of the training set are defined in `paddle.layer.data`, too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. -#### Stacked bidirectional LSTM +### Stacked bidirectional LSTM We create a neural network `stacked_lstm_net` as below. 
```python -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - """ - A Wrapper for sentiment classification task. - This network uses a bi-directional recurrent network, - consisting of three LSTM layers. This configuration is - motivated from the following paper, but uses few layers. - http://www.aclweb.org/anthology/P15-1109 - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - """ - assert stacked_num % 2 == 1 - - fc_para_attr = paddle.attr.Param(learning_rate=1e-3) - lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) - relu = paddle.activation.Relu() - linear = paddle.activation.Linear() - - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - - fc1 = paddle.layer.fc(input=emb, - size=hid_dim, - act=linear, - bias_attr=bias_attr) - lstm1 = paddle.layer.lstmemory( - input=fc1, act=relu, bias_attr=bias_attr) +def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): - fc = paddle.layer.fc(input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = paddle.layer.lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr) + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) inputs = [fc, lstm] - fc_last = paddle.layer.pooling( - input=inputs[0], pooling_type=paddle.pooling.Max()) - lstm_last = paddle.layer.pooling( - input=inputs[1], pooling_type=paddle.pooling.Max()) - output = paddle.layer.fc(input=[fc_last, lstm_last], - size=class_dim, - act=paddle.activation.Softmax(), - bias_attr=bias_attr, - param_attr=para_attr) - - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost, output -``` + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') -1. Define input data and its dimension + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + return prediction - Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. In `stacked_lstm_net`, the input to the network is defined in `paddle.layer.data`. +``` +The above stacked bidirectional LSTM network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. -1. Define Classifier +To reiterate, we can either invoke `convolution_net` or `stacked_lstm_net`. In below steps, we will go with `convolution_net`. - The above stacked bidirectional LSTM network extracts high-level features and maps them to a vector of the same size as the categories. 
`paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. +Next we define an `inference_program` that simply uses `convolution_net` to predict output with the input from `fluid.layer.data`. -1. Define Loss Function +```python +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) + return net +``` - In the context of supervised learning, labels of the training set are defined in `paddle.layer.data`, too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. +Then we define a `training_program` that uses the result from `inference_program` to compute the cost with label data. +Also define `optimizer_func` to specify the optimizer. -To reiterate, we can either invoke `convolution_net` or `stacked_lstm_net`. +In the context of supervised learning, labels of the training set are defined in `paddle.layer.data` too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. +First result that returns from the list must be cost. ```python -word_dict = paddle.dataset.imdb.word_dict() -dict_dim = len(word_dict) -class_dim = 2 +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + -# option 1 -[cost, output] = convolution_net(dict_dim, class_dim=class_dim) -# option 2 -# [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) ``` ## Model Training -### Define Parameters +### Specify training environment -First, we create the model parameters according to the previous model configuration `cost`. +Specify your training environment, you should specify if the training is on CPU or GPU. ```python -# create parameters -parameters = paddle.parameters.create(cost) +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() ``` -### Create Trainer +### Datafeeder Configuration -Before jumping into creating a training module, algorithm setting is also necessary. -Here we specified `Adam` optimization algorithm via `paddle.optimizer`. +Next we define data feeders for test and train. The feeder reads a `buf_size` of data each time and feed them to the training/testing process. +`paddle.dataset.imdb.train` will yield records during each pass, after shuffling, a batch input of `BATCH_SIZE` is generated for training. -```python -# create optimizer -adam_optimizer = paddle.optimizer.Adam( - learning_rate=2e-3, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - -# create trainer -trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=adam_optimizer) -``` +Notice for loading and reading IMDB data, it could take up to 1 minute. Please be patient. 
-### Training +```python -`paddle.dataset.imdb.train()` will yield records during each pass, after shuffling, a batch input is generated for training. +print("Loading IMDB word dict....") +word_dict = paddle.dataset.imdb.word_dict() -```python +print ("Reading training data....") train_reader = paddle.batch( paddle.reader.shuffle( - lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=100) + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) +``` -test_reader = paddle.batch( - lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) + +### Create Trainer + +Create a trainer that takes `train_program` as input and specify optimizer function. + +```python +trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) ``` -`feeding` is devoted to specifying the correspondence between each yield record and `paddle.layer.data`. For instance, the first column of data generated by `paddle.dataset.imdb.train()` corresponds to `word` feature. +### Feeding Data + +`feed_order` is devoted to specifying the correspondence between each yield record and `paddle.layer.data`. For instance, the first column of data generated by `imdb.train` corresponds to `words`. ```python -feeding = {'word': 0, 'label': 1} +feed_order = ['words', 'label'] ``` -Callback function `event_handler` will be invoked to track training progress when a pre-defined event happens. +### Event Handler + +Callback function `event_handler` will be called during training when a pre-defined event happens. +For example, we can check the cost by `trainer.test` when `EndStepEvent` occurs ```python +# Specify the directory path to save the parameters +params_dirname = "understand_sentiment_conv.inference.model" + def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - with open('./params_pass_%d.tar' % event.pass_id, 'w') as f: - trainer.save_parameter_to_tar(f) - - result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + if isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + if event.step == 10: + trainer.save_params(params_dirname) + trainer.stop() ``` -Finally, we can invoke `trainer.train` to start training: +### Training + +Finally, we invoke `trainer.train` to start training with `num_epochs` and other parameters. ```python trainer.train( - reader=train_reader, + num_epochs=1, event_handler=event_handler, - feeding=feeding, - num_passes=10) + reader=train_reader, + feed_order=feed_order) ``` +## Inference + +### Create Inferencer + +Initialize Inferencer with `inference_program` and `params_dirname` which is where we save params from training. + +```python +inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=params_dirname, + place=place) +``` + +### Create Lod Tensor with test data + +To do inference, we pick 3 potential reviews out of our mind as testing data. Feel free to modify any of them. +We map each word in the reviews to id from `word_dict`, replaced by 'unknown' if the word is not in `word_dict`. 
+Then we create lod data with the id list and use `create_lod_tensor` to create lod tensor. + +```python +reviews_str = [ + 'read the book forget the movie', 'this is a great movie', 'this is very bad' +] +reviews = [c.split() for c in reviews_str] + +UNK = word_dict[''] +lod = [] +for c in reviews: + lod.append([word_dict.get(words, UNK) for words in c]) + +base_shape = [[len(c) for c in lod]] + +tensor_words = fluid.create_lod_tensor(lod, base_shape, place) +``` + +### Infer + +Now we can infer and predict probability of positive or negative from each review above. + +```python +results = inferencer.infer({'words': tensor_words}) + +for i, r in enumerate(results[0]): + print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'") + + +``` ## Conclusion diff --git a/06.understand_sentiment/train.py b/06.understand_sentiment/train.py deleted file mode 100644 index 58f61700c682b9c8210aba4ea9700cd1ddd76976..0000000000000000000000000000000000000000 --- a/06.understand_sentiment/train.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys, os -import paddle.v2 as paddle - -with_gpu = os.getenv('WITH_GPU', '0') != '0' - - -def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - conv_3 = paddle.networks.sequence_conv_pool( - input=emb, context_len=3, hidden_size=hid_dim) - conv_4 = paddle.networks.sequence_conv_pool( - input=emb, context_len=4, hidden_size=hid_dim) - output = paddle.layer.fc( - input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax()) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost, output - - -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - """ - A Wrapper for sentiment classification task. - This network uses bi-directional recurrent network, - consisting three LSTM layers. This configure is referred to - the paper as following url, but use fewer layrs. - http://www.aclweb.org/anthology/P15-1109 - - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - """ - assert stacked_num % 2 == 1 - - fc_para_attr = paddle.attr.Param(learning_rate=1e-3) - lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) 
- relu = paddle.activation.Relu() - linear = paddle.activation.Linear() - - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - - fc1 = paddle.layer.fc( - input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) - lstm1 = paddle.layer.lstmemory(input=fc1, act=relu, bias_attr=bias_attr) - - inputs = [fc1, lstm1] - for i in range(2, stacked_num + 1): - fc = paddle.layer.fc( - input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = paddle.layer.lstmemory( - input=fc, reverse=(i % 2) == 0, act=relu, bias_attr=bias_attr) - inputs = [fc, lstm] - - fc_last = paddle.layer.pooling( - input=inputs[0], pooling_type=paddle.pooling.Max()) - lstm_last = paddle.layer.pooling( - input=inputs[1], pooling_type=paddle.pooling.Max()) - output = paddle.layer.fc( - input=[fc_last, lstm_last], - size=class_dim, - act=paddle.activation.Softmax(), - bias_attr=bias_attr, - param_attr=para_attr) - - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost, output - - -if __name__ == '__main__': - # init - paddle.init(use_gpu=with_gpu) - - #data - print 'load dictionary...' - word_dict = paddle.dataset.imdb.word_dict() - dict_dim = len(word_dict) - class_dim = 2 - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=100) - test_reader = paddle.batch( - paddle.dataset.imdb.test(word_dict), batch_size=100) - - feeding = {'word': 0, 'label': 1} - - # network config - # Please choose the way to build the network - # by uncommenting the corresponding line. - [cost, output] = convolution_net(dict_dim, class_dim=class_dim) - # [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) - - # create parameters - parameters = paddle.parameters.create(cost) - - # create optimizer - adam_optimizer = paddle.optimizer.Adam( - learning_rate=2e-3, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - - # create trainer - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=adam_optimizer) - - # End batch and end pass event handler - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - with open('./params_pass_%d.tar' % event.pass_id, 'w') as f: - trainer.save_parameter_to_tar(f) - - result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - - # Save the inference topology to protobuf. - inference_topology = paddle.topology.Topology(layers=output) - with open("./inference_topology.pkl", 'wb') as f: - inference_topology.serialize_for_inference(f) - - trainer.train( - reader=train_reader, - event_handler=event_handler, - feeding=feeding, - num_passes=20) diff --git a/06.understand_sentiment/train_conv.py b/06.understand_sentiment/train_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..61fe18e40743629b5fabade1ab4c713d3afe5cba --- /dev/null +++ b/06.understand_sentiment/train_conv.py @@ -0,0 +1,166 @@ +# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +BATCH_SIZE = 128 + + +def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc( + input=[conv_3, conv_4], size=class_dim, act="softmax") + return prediction + + +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) + return net + + +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) + + +def train(use_cuda, train_program, params_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + print("Loading IMDB word dict....") + word_dict = paddle.dataset.imdb.word_dict() + + print("Reading training data....") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) + + print("Reading testing data....") + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + + trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) + + feed_order = ['words', 'label'] + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 10 == 0: + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=feed_order) + + print('Step {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.step, avg_cost, acc)) + + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + elif isinstance(event, fluid.EndEpochEvent): + trainer.save_params(params_dirname) + + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) + + +def infer(use_cuda, inference_program, params_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + word_dict = paddle.dataset.imdb.word_dict() + + inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=params_dirname, + place=place) + + # Setup 
input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + + reviews_str = [ + 'read the book forget the movie', 'this is a great movie', + 'this is very bad' + ] + reviews = [c.split() for c in reviews_str] + + UNK = word_dict[''] + lod = [] + for c in reviews: + lod.append([word_dict.get(words, UNK) for words in c]) + + base_shape = [[len(c) for c in lod]] + + tensor_words = fluid.create_lod_tensor(lod, base_shape, place) + results = inferencer.infer({'words': tensor_words}) + + for i, r in enumerate(results[0]): + print("Predict probability of ", r[0], " to be positive and ", r[1], + " to be negative for review \'", reviews_str[i], "\'") + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + params_dirname = "understand_sentiment_conv.inference.model" + train(use_cuda, train_program, params_dirname) + infer(use_cuda, inference_program, params_dirname) + + +if __name__ == '__main__': + use_cuda = os.getenv('WITH_GPU', '0') != '0' + main(use_cuda) diff --git a/06.understand_sentiment/train_dyn_rnn.py b/06.understand_sentiment/train_dyn_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3be26ffc939597dec6d298217f39c0f30de931 --- /dev/null +++ b/06.understand_sentiment/train_dyn_rnn.py @@ -0,0 +1,183 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
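+
+# Overview of this script: dynamic_rnn_lstm() below builds the LSTM cell by
+# hand with fluid.layers.DynamicRNN rather than calling
+# fluid.layers.dynamic_lstm. At every step it computes the input, forget,
+# output and candidate gates from the current word and the previous hidden
+# state, updates the cell memory, and emits the hidden state; the hidden
+# state of the last step feeds a softmax classifier.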
+ +from __future__ import print_function + +import os +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +BATCH_SIZE = 128 +LSTM_SIZE = 128 +USE_GPU = False + + +def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common(ipt, hidden, size): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + return gate0 + gate1 + + forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + + cell = forget_gate * prev_cell + input_gate * cell_gate + hidden = output_gate * fluid.layers.tanh(x=cell) + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_last_step(rnn()) + prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax") + return prediction + + +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE) + return pred + + +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) + + +def train(use_cuda, train_program, params_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + print("Loading IMDB word dict....") + word_dict = paddle.dataset.imdb.word_dict() + + print("Reading training data....") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) + + print("Reading testing data....") + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + + trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) + + feed_order = ['words', 'label'] + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 10 == 0: + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=feed_order) + + print('Step {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.step, avg_cost, acc)) + + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + elif isinstance(event, fluid.EndEpochEvent): + trainer.save_params(params_dirname) + + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) + + +def infer(use_cuda, inference_program, params_dirname=None): + place = 
fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + word_dict = paddle.dataset.imdb.word_dict() + + inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=params_dirname, + place=place) + + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensor will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + + reviews_str = [ + 'read the book forget the movie', 'this is a great movie', + 'this is very bad' + ] + reviews = [c.split() for c in reviews_str] + + UNK = word_dict[''] + lod = [] + for c in reviews: + lod.append([word_dict.get(words, UNK) for words in c]) + + base_shape = [[len(c) for c in lod]] + + tensor_words = fluid.create_lod_tensor(lod, base_shape, place) + results = inferencer.infer({'words': tensor_words}) + + for i, r in enumerate(results[0]): + print("Predict probability of ", r[0], " to be positive and ", r[1], + " to be negative for review \'", reviews_str[i], "\'") + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + params_dirname = "understand_sentiment_conv.inference.model" + train(use_cuda, train_program, params_dirname) + infer(use_cuda, inference_program, params_dirname) + + +if __name__ == '__main__': + use_cuda = os.getenv('WITH_GPU', '0') != '0' + main(use_cuda) diff --git a/06.understand_sentiment/train_stacked_lstm.py b/06.understand_sentiment/train_stacked_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..351994f82a01a08ce7fc9425b90f9260383ee2f6 --- /dev/null +++ b/06.understand_sentiment/train_stacked_lstm.py @@ -0,0 +1,174 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
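+
+# Overview of this script: stacked_lstm_net() below implements the stacked
+# bidirectional LSTM described in the README. It chains stacked_num pairs of
+# fc + dynamic_lstm layers, reversing the sequence direction on every
+# even-numbered layer via is_reverse=(i % 2) == 0 (hence stacked_num must be
+# odd), then max-pools the outputs of the last fc and lstm layers and feeds
+# them to a softmax classifier.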
+ +from __future__ import print_function + +import os +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +STACKED_NUM = 3 +BATCH_SIZE = 128 +USE_GPU = False + + +def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): + assert stacked_num % 2 == 1 + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) + + inputs = [fc1, lstm1] + + for i in range(2, stacked_num + 1): + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) + inputs = [fc, lstm] + + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') + + prediction = fluid.layers.fc( + input=[fc_last, lstm_last], size=class_dim, act='softmax') + return prediction + + +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, + STACKED_NUM) + return net + + +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) + + +def train(use_cuda, train_program, params_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + print("Loading IMDB word dict....") + word_dict = paddle.dataset.imdb.word_dict() + + print("Reading training data....") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) + + print("Reading testing data....") + test_reader = paddle.batch( + paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + + trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) + + feed_order = ['words', 'label'] + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 10 == 0: + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=feed_order) + + print('Step {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.step, avg_cost, acc)) + + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + elif isinstance(event, fluid.EndEpochEvent): + trainer.save_params(params_dirname) + + trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) + + +def infer(use_cuda, inference_program, params_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + word_dict = paddle.dataset.imdb.word_dict() + + inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), + param_path=params_dirname, + place=place) + + # Setup input by creating LoDTensor to represent sequence of words. 
+    # Here each word is the basic element of the LoDTensor and the shape of
+    # each word (base_shape) should be [1] since it is simply an index to
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only
+    # one higher level structure (sequence of words, or sentence) than the basic
+    # element (word). Hence the LoDTensor will hold data for three sentences of
+    # length 3, 4 and 2, respectively.
+    # Note that lod info should be a list of lists.
+
+    reviews_str = [
+        'read the book forget the movie', 'this is a great movie',
+        'this is very bad'
+    ]
+    reviews = [c.split() for c in reviews_str]
+
+    # Use the dictionary's unknown-word token as the fallback id.
+    UNK = word_dict['<unk>']
+    lod = []
+    for c in reviews:
+        lod.append([word_dict.get(words, UNK) for words in c])
+
+    base_shape = [[len(c) for c in lod]]
+
+    tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
+    results = inferencer.infer({'words': tensor_words})
+
+    for i, r in enumerate(results[0]):
+        print("Predict probability of ", r[0], " to be positive and ", r[1],
+              " to be negative for review \'", reviews_str[i], "\'")
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_stacked_lstm.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    use_cuda = os.getenv('WITH_GPU', '0') != '0'
+    main(use_cuda)