diff --git a/.clang-format b/.clang-format index 6bbd46d0ff956517991d4faad3f2c026487f412b..9ba433b17362424973626470d930356c2173dd84 100644 --- a/.clang-format +++ b/.clang-format @@ -13,8 +13,6 @@ # The document of clang-format is # http://clang.llvm.org/docs/ClangFormat.html # http://clang.llvm.org/docs/ClangFormatStyleOptions.html -# -# TODO(yuyang18): Add python and other language code style --- Language: Cpp BasedOnStyle: Google @@ -22,8 +20,9 @@ IndentWidth: 2 TabWidth: 2 ContinuationIndentWidth: 4 AccessModifierOffset: -2 # The private/protected/public has no indent in class -PointerAlignment: Left # int* p/int& p, not int *p/int &p Standard: Cpp11 AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false ... diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9385943da92bc8c44ca75b267a768ba8ea22bd8b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,24 @@ +- repo: https://github.com/Lucas-C/pre-commit-hooks.git + sha: c25201a00e6b0514370501050cf2a8538ac12270 + hooks: + - id: remove-crlf +- repo: https://github.com/reyoung/mirrors-yapf.git + sha: v0.13.2 + hooks: + - id: yapf +- repo: https://github.com/pre-commit/pre-commit-hooks + sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71 + hooks: + - id: check-added-large-files + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + - id: end-of-file-fixer +# TODO(yuyang): trailing whitespace has some bugs on markdown +# files now, please not add it to pre-commit hook now +# - id: trailing-whitespace +# +# TODO(yuyang): debug-statements not fit for Paddle, because +# not all of our python code is runnable. Some are used for +# documenation +# - id: debug-statements diff --git a/demo/introduction/README.md b/demo/introduction/README.md index bebf1d090d98691199ede55736dfe5b964a8fd42..0614a7afe645677ef0b65a17ea05f1dcfa45214f 100644 --- a/demo/introduction/README.md +++ b/demo/introduction/README.md @@ -1,4 +1,3 @@ This folder contains scripts used in PaddlePaddle introduction. - use `bash train.sh` to train a simple linear regression model - use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3]. - diff --git a/demo/mnist/data/get_mnist_data.sh b/demo/mnist/data/get_mnist_data.sh index 9099b5ab6fb85d86d346a7ad819538fbd013c6ff..5a2e34026d4fe7f8315d4f5453bec7c4ee4f6885 100755 --- a/demo/mnist/data/get_mnist_data.sh +++ b/demo/mnist/data/get_mnist_data.sh @@ -19,4 +19,3 @@ done cd $DIR rm -f *.list python generate_list.py - diff --git a/demo/recommendation/data/config.json b/demo/recommendation/data/config.json index 71a9dd7be6bd10e177dfb443a94b719c3816d833..f26e74ce47bb7843a571e6033f051c046b31f054 100644 --- a/demo/recommendation/data/config.json +++ b/demo/recommendation/data/config.json @@ -14,4 +14,3 @@ "fields": ["id", "title", "genres"] } } - diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh index 804f722e5b8e9ee5b54c778c54f7833f5e6c4de0..844649e8c0f6867dc0766e4ec6f250c5a4a004d9 100644 --- a/demo/semantic_role_labeling/test.sh +++ b/demo/semantic_role_labeling/test.sh @@ -37,4 +37,3 @@ paddle train \ --use_gpu=false \ --config_args=is_test=1 \ 2>&1 | tee 'test.log' - diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh index 94c7b6f31df3b5e5e059d6e1323ae0c0bec74753..c3a22b644be0ca08a2af73a57c09657014e49bfc 100644 --- a/demo/semantic_role_labeling/train.sh +++ b/demo/semantic_role_labeling/train.sh @@ -24,4 +24,3 @@ paddle train \ --show_parameter_stats_period=10 \ --test_all_data_in_one_period=1 \ 2>&1 | tee 'train.log' - diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/demo/semantic_role_labeling/semantic_role_labeling.md index 05fbc8278daf204df60ad19b742c920e47128c27..890f7314582c65e9add50664006b57aa4e0709eb 100644 --- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md +++ b/doc/demo/semantic_role_labeling/semantic_role_labeling.md @@ -1,183 +1,183 @@ -# Semantic Role labeling Tutorial # - -Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]: - - [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. - -- V: verb -- A0: acceptor -- A1: thing accepted -- A2: accepted-from -- A3: Attribute -- AM-MOD: modal -- AM-NEG: negation - -Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. - -To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. - -## Data Description -The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website. - -To download and process the original data, user just need to execute the following command: - -```bash -cd data -./get_data.sh -``` -Several new files appear in the `data `directory as follows. -```bash -conll05st-release:the test data set of CoNll-2005 shared task -test.wsj.words:the Wall Street Journal data sentences -test.wsj.props: the propositional arguments -src.dict:the dictionary of words in sentences -tgt.dict:the labels dictionary -feature: the extracted features from data set -``` - -## Training -### DB-LSTM -Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. - -Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. - -The following figure shows a temporal expanded 2-layer DB-LSTM network. -
-![pic](./network_arch.png) -
- -### Features -Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]: -
-![pic](./feature.jpg) -
- -In this sample, the coresponding labelled sentence is: - -[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . - -In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`. - -### Data Provider - -`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots. -``` -def hook(settings, word_dict, label_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(2), - integer_value_sequence(len(label_dict))] -``` -The corresponding data iterator is as following: -``` -@provider(use_seq=True, init_hook=hook) -def process(obj, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip().split('\t') - words = sentence.split() - sen_len = len(words) - word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len - ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX) ] * sen_len - ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX) ] * sen_len - ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX) ] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [obj.label_dict.get(w) for w in label_list] - - yield word_slot, predicate_slot, ctx_n1_slot, ctx_0_slot, ctx_p1_slot, mark_slot, label_slot -``` -The `process`function yield 7 lists which are six features and labels. - -### Neural Network Config -`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure. - -Seven `data_layer` load instances from data provider. Six features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels. - -### Run Training -The script for training is `train.sh`, user just need to execute: -```bash - ./train.sh -``` -The content in `train.sh`: -``` -paddle train \ - --config=./db_lstm.py \ - --save_dir=./output \ - --trainer_count=4 \ - --log_period=10 \ - --num_passes=500 \ - --use_gpu=false \ - --show_parameter_stats_period=10 \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' -``` - -- \--config=./db_lstm.py : network config file. -- \--save_di=./output: output path to save models. -- \--trainer_count=4 : set thread number (or GPU count). -- \--log_period=10 : print log every 20 batches. -- \--num_passes=500: set pass number, one pass in PaddlePaddle means training all samples in dataset one time. -- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train. -- \--show_parameter_stats_period=10: show parameter statistic every 100 batches. -- \--test_all_data_in_one_period=1: test all data in every testing. - - -After training, the models will be saved in directory `output`. - -### Run testing -The script for testing is `test.sh`, user just need to execute: -```bash - ./test.sh -``` -The main part in `tesh.sh` -``` -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --config_args=is_test=1 \ -``` - - - \--config=./db_lstm.py: network config file - - \--model_list=$model_list.list: model list file - - \--job=test: indicate the test job - - \--config_args=is_test=1: flag to indicate test - - -### Run prediction -The script for prediction is `predict.sh`, user just need to execute: -```bash - ./predict.sh - -``` -In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file -``` -python predict.py - -c $config_file - -w $model_path - -l $label_file - -d $dict_file - -i $input_file -``` - -`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix. - -After prediction, the result is saved in `predict.res`. - -## Reference -[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. - -[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. +# Semantic Role labeling Tutorial # + +Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]: + + [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. + +- V: verb +- A0: acceptor +- A1: thing accepted +- A2: accepted-from +- A3: Attribute +- AM-MOD: modal +- AM-NEG: negation + +Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. + +To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. + +## Data Description +The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website. + +To download and process the original data, user just need to execute the following command: + +```bash +cd data +./get_data.sh +``` +Several new files appear in the `data `directory as follows. +```bash +conll05st-release:the test data set of CoNll-2005 shared task +test.wsj.words:the Wall Street Journal data sentences +test.wsj.props: the propositional arguments +src.dict:the dictionary of words in sentences +tgt.dict:the labels dictionary +feature: the extracted features from data set +``` + +## Training +### DB-LSTM +Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. + +Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. + +The following figure shows a temporal expanded 2-layer DB-LSTM network. +
+![pic](./network_arch.png) +
+ +### Features +Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]: +
+![pic](./feature.jpg) +
+ +In this sample, the coresponding labelled sentence is: + +[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . + +In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`. + +### Data Provider + +`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots. +``` +def hook(settings, word_dict, label_dict, **kwargs): + settings.word_dict = word_dict + settings.label_dict = label_dict + #all inputs are integral and sequential type + settings.slots = [ + integer_value_sequence(len(word_dict)), + integer_value_sequence(len(word_dict)), + integer_value_sequence(len(word_dict)), + integer_value_sequence(len(word_dict)), + integer_value_sequence(len(word_dict)), + integer_value_sequence(2), + integer_value_sequence(len(label_dict))] +``` +The corresponding data iterator is as following: +``` +@provider(use_seq=True, init_hook=hook) +def process(obj, file_name): + with open(file_name, 'r') as fdata: + for line in fdata: + sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip().split('\t') + words = sentence.split() + sen_len = len(words) + word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words] + + predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len + ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX) ] * sen_len + ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX) ] * sen_len + ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX) ] * sen_len + + marks = mark.split() + mark_slot = [int(w) for w in marks] + + label_list = label.split() + label_slot = [obj.label_dict.get(w) for w in label_list] + + yield word_slot, predicate_slot, ctx_n1_slot, ctx_0_slot, ctx_p1_slot, mark_slot, label_slot +``` +The `process`function yield 7 lists which are six features and labels. + +### Neural Network Config +`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure. + +Seven `data_layer` load instances from data provider. Six features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels. + +### Run Training +The script for training is `train.sh`, user just need to execute: +```bash + ./train.sh +``` +The content in `train.sh`: +``` +paddle train \ + --config=./db_lstm.py \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=10 \ + --num_passes=500 \ + --use_gpu=false \ + --show_parameter_stats_period=10 \ + --test_all_data_in_one_period=1 \ +2>&1 | tee 'train.log' +``` + +- \--config=./db_lstm.py : network config file. +- \--save_di=./output: output path to save models. +- \--trainer_count=4 : set thread number (or GPU count). +- \--log_period=10 : print log every 20 batches. +- \--num_passes=500: set pass number, one pass in PaddlePaddle means training all samples in dataset one time. +- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train. +- \--show_parameter_stats_period=10: show parameter statistic every 100 batches. +- \--test_all_data_in_one_period=1: test all data in every testing. + + +After training, the models will be saved in directory `output`. + +### Run testing +The script for testing is `test.sh`, user just need to execute: +```bash + ./test.sh +``` +The main part in `tesh.sh` +``` +paddle train \ + --config=./db_lstm.py \ + --model_list=$model_list \ + --job=test \ + --config_args=is_test=1 \ +``` + + - \--config=./db_lstm.py: network config file + - \--model_list=$model_list.list: model list file + - \--job=test: indicate the test job + - \--config_args=is_test=1: flag to indicate test + + +### Run prediction +The script for prediction is `predict.sh`, user just need to execute: +```bash + ./predict.sh + +``` +In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file +``` +python predict.py + -c $config_file + -w $model_path + -l $label_file + -d $dict_file + -i $input_file +``` + +`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix. + +After prediction, the result is saved in `predict.res`. + +## Reference +[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. + +[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. diff --git a/doc/index.md b/doc/index.md index a4dffb0405a6b23c88473307a1d199e3caaadf55..cbd08ba52abe529aec84f6b1c2e35300496878a5 100644 --- a/doc/index.md +++ b/doc/index.md @@ -8,7 +8,7 @@ User Guide * [Build and Installation](build/index.rst) * [Contribute Code](build/contribute_to_paddle.md) * [User Interface](ui/index.md) -* [Model Config Interface](ui/api/trainer_config_helpers/index.md) +* [Model Config Interface](ui/api/trainer_config_helpers/index.rst) * [Example and Demo](demo/index.md) * [Cluster Train](cluster/index.md) diff --git a/doc/introduction/index.md b/doc/introduction/index.md index 004ca07844da0fdbea359508c9fae1012aaad421..01f52031a1d0247cd0b885218c17001f23685239 100644 --- a/doc/introduction/index.md +++ b/doc/introduction/index.md @@ -98,4 +98,3 @@ There, you have recovered the underlying pattern between `X` and `Y` only from o - Build and Installation - Quick Start - Example and Demo - diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index 070ed03ab6cc938f735667701bd46eec33ea77b4..269e6491e7ebe3899c3fb24fca756a393043473b 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -1,3 +1,7 @@ +=========== +Activations +=========== + BaseActivation ============== @@ -102,4 +106,3 @@ STanhActivation .. automodule:: paddle.trainer_config_helpers.activations :members: STanhActivation :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/activations_index.rst b/doc/ui/api/trainer_config_helpers/activations_index.rst deleted file mode 100644 index 1c0b71ab77eec62859c1d7615f6ebe637f3108ac..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/activations_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Activations -=========== - -.. toctree:: - :maxdepth: 3 - - activations.rst diff --git a/doc/ui/api/trainer_config_helpers/evaluators.rst b/doc/ui/api/trainer_config_helpers/evaluators.rst index 0586c9907e472dd98c5f7e9098251f3bc6b88bab..d6a79c13e2316b0fd3d53eb47960a767bcf8abdb 100644 --- a/doc/ui/api/trainer_config_helpers/evaluators.rst +++ b/doc/ui/api/trainer_config_helpers/evaluators.rst @@ -1,3 +1,7 @@ +========== +Evaluators +========== + Base ==== .. automodule:: paddle.trainer_config_helpers.evaluators diff --git a/doc/ui/api/trainer_config_helpers/evaluators_index.rst b/doc/ui/api/trainer_config_helpers/evaluators_index.rst deleted file mode 100644 index 298de3e1a32d36b9102f5ad64cc1b968f418041b..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/evaluators_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Evaluators -========== - -.. toctree:: - :maxdepth: 3 - - evaluators.rst diff --git a/doc/ui/api/trainer_config_helpers/index.md b/doc/ui/api/trainer_config_helpers/index.md deleted file mode 100644 index 00fa99bb3fa4c407dc867f91f4c7c495dc4061a1..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/index.md +++ /dev/null @@ -1,10 +0,0 @@ -# Model Config Interface - -* [Optimizer](optimizers_index.rst) -* [Data Source](data_sources.rst) -* [Layers](layers_index.rst) -* [Activations](activations_index.rst) -* [Poolings](poolings_index.rst) -* [Networks](networks_index.rst) -* [Evaluators](evaluators_index.rst) -* [Parameter and Extra Layer Attribute](attrs.rst) diff --git a/doc/ui/api/trainer_config_helpers/index.rst b/doc/ui/api/trainer_config_helpers/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..8395eb75710b3e67ec0c5442f79c999bdacdff42 --- /dev/null +++ b/doc/ui/api/trainer_config_helpers/index.rst @@ -0,0 +1,14 @@ +Model Config Interface +====================== + +.. toctree:: + :maxdepth: 1 + + optimizers.rst + data_sources.rst + layers.rst + activations.rst + poolings.rst + networks.rst + evaluators.rst + attrs.rst diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 56b23640205ec8c7575541bd270720fb861457a1..4a02af396993207d305be488c993ce94cf20fe1d 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -1,3 +1,7 @@ +====== +Layers +====== + Base ====== @@ -47,7 +51,7 @@ conv_operator :noindex: conv_projection -------------- +--------------- .. automodule:: paddle.trainer_config_helpers.layers :members: conv_projection :noindex: @@ -187,6 +191,12 @@ embedding_layer :members: embedding_layer :noindex: +scaling_projection +----------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: scaling_projection + :noindex: + dotmul_projection ----------------- .. automodule:: paddle.trainer_config_helpers.layers diff --git a/doc/ui/api/trainer_config_helpers/layers_index.rst b/doc/ui/api/trainer_config_helpers/layers_index.rst deleted file mode 100644 index c0daab152148ce769948f600c3101bd79f5a1013..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/layers_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Layers -====== - -.. toctree:: - :maxdepth: 3 - - layers.rst diff --git a/doc/ui/api/trainer_config_helpers/networks.rst b/doc/ui/api/trainer_config_helpers/networks.rst index 2a15b34eaea0b763f992a7225550e6af747f303c..29c52c5ce3078f1755162dbbdd65a059d8ba9fa4 100644 --- a/doc/ui/api/trainer_config_helpers/networks.rst +++ b/doc/ui/api/trainer_config_helpers/networks.rst @@ -1,3 +1,9 @@ +======== +Networks +======== + +The networks module contains pieces of neural network that combine multiple layers. + NLP === @@ -111,4 +117,3 @@ outputs .. automodule:: paddle.trainer_config_helpers.networks :members: outputs :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/networks_index.rst b/doc/ui/api/trainer_config_helpers/networks_index.rst deleted file mode 100644 index 17bc4dfaa6c4ed3cd5daf0476d0d4c15a2067a22..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/networks_index.rst +++ /dev/null @@ -1,9 +0,0 @@ -Networks -======== - -The networks module contains pieces of neural network that combine multiple layers. - -.. toctree:: - :maxdepth: 3 - - networks.rst diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst index b487fec64c4ebb5cfbdff1aa101d9b3675776a2c..7ca4e34156e273caf66cc71e6927bfb23bb5235e 100644 --- a/doc/ui/api/trainer_config_helpers/optimizers.rst +++ b/doc/ui/api/trainer_config_helpers/optimizers.rst @@ -1,3 +1,7 @@ +========== +Optimizers +========== + BaseSGDOptimizer ================ .. automodule:: paddle.trainer_config_helpers.optimizers @@ -51,4 +55,3 @@ settings .. automodule:: paddle.trainer_config_helpers.optimizers :members: settings :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/optimizers_index.rst b/doc/ui/api/trainer_config_helpers/optimizers_index.rst deleted file mode 100644 index f39f94f0cd6e1a6c3c25eeceb7820a7fbc070570..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/optimizers_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Optimizers -========== - -.. toctree:: - :maxdepth: 3 - - optimizers.rst diff --git a/doc/ui/api/trainer_config_helpers/poolings.rst b/doc/ui/api/trainer_config_helpers/poolings.rst index caadec639383aad24ed477d8bdaeaa31c0026bb5..66566809d26f59263597b5286c5b27e0bbc9415a 100644 --- a/doc/ui/api/trainer_config_helpers/poolings.rst +++ b/doc/ui/api/trainer_config_helpers/poolings.rst @@ -1,3 +1,7 @@ +======== +Poolings +======== + BasePoolingType =============== .. automodule:: paddle.trainer_config_helpers.poolings @@ -27,4 +31,3 @@ SquareRootNPooling .. automodule:: paddle.trainer_config_helpers.poolings :members: SquareRootNPooling :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/poolings_index.rst b/doc/ui/api/trainer_config_helpers/poolings_index.rst deleted file mode 100644 index 250d3fa69c0dcedfd689b685fe7b47ec71d02fee..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/poolings_index.rst +++ /dev/null @@ -1,9 +0,0 @@ -Poolings -======== - -These pooling types are used for sequence input, not for images. - -.. toctree:: - :maxdepth: 3 - - poolings.rst diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.md index 5282bbbcb82d00f5aed7b784d2bd44f9ec33fa42..519653df081d6e7919ada3cbff6aaf4d2a2f6115 100644 --- a/doc_cn/algorithm/rnn/hierarchical-layer.md +++ b/doc_cn/algorithm/rnn/hierarchical-layer.md @@ -1,66 +1,66 @@ -# 支持双层序列作为输入的Layer - -## 概述 - -在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 - -双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 - -我们可以按照如下层次定义非序列,单层序列,以及双层序列。 - -+ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 -+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 -+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 - - -在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 -## pooling_layer - -pooling_layer的使用示例如下,详细见配置API。 -```python -seq_pool = pooling_layer(input=layer, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.EACH_SEQUENCE) -``` -- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 -- `agg_level=AggregateLevel.TIMESTEP`时(默认值): - - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 - - 输入:一个双层序列,或一个单层序列 - - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) -- `agg_level=AggregateLevel.EACH_SEQUENCE`时: - - 作用:一个双层序列经过运算变成一个单层序列 - - 输入:必须是一个双层序列 - - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) - -## last_seq 和 first_seq - -last_seq的使用示例如下(first_seq类似),详细见配置API。 -```python -last = last_seq(input=layer, - agg_level=AggregateLevel.EACH_SEQUENCE) -``` -- `agg_level=AggregateLevel.TIMESTEP`时(默认值): - - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 - - 输入:一个双层序列或一个单层序列 - - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 -- `agg_level=AggregateLevel.EACH_SEQUENCE`时: - - 作用:一个双层序列经过运算变成一个单层序列 - - 输入:必须是一个双层序列 - - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 - -## expand_layer - -expand_layer的使用示例如下,详细见配置API。 -```python -expand = expand_layer(input=layer1, - expand_as=layer2, - expand_level=ExpandLevel.FROM_TIMESTEP) -``` -- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值): - - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 - - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 - - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 -- `expand_level=ExpandLevel.FROM_SEQUENCE`时: - - 作用:一个单层序列经过运算扩展成一个双层序列 - - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息 - - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。 \ No newline at end of file +# 支持双层序列作为输入的Layer + +## 概述 + +在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 + +双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 + +我们可以按照如下层次定义非序列,单层序列,以及双层序列。 + ++ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 ++ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 ++ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 + + +在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 +## pooling_layer + +pooling_layer的使用示例如下,详细见配置API。 +```python +seq_pool = pooling_layer(input=layer, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 + - 输入:一个双层序列,或一个单层序列 + - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) + +## last_seq 和 first_seq + +last_seq的使用示例如下(first_seq类似),详细见配置API。 +```python +last = last_seq(input=layer, + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 + - 输入:一个双层序列或一个单层序列 + - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 + +## expand_layer + +expand_layer的使用示例如下,详细见配置API。 +```python +expand = expand_layer(input=layer1, + expand_as=layer2, + expand_level=ExpandLevel.FROM_TIMESTEP) +``` +- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值): + - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 + - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 + - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 +- `expand_level=ExpandLevel.FROM_SEQUENCE`时: + - 作用:一个单层序列经过运算扩展成一个双层序列 + - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息 + - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。 diff --git a/doc_cn/algorithm/rnn/hierarchical-rnn.md b/doc_cn/algorithm/rnn/hierarchical-rnn.md index 4a85cf336146ef368b04c13fdc74f39ee7a361d3..c184a34e85a571e98e88c14ef653356fdd555a19 100644 --- a/doc_cn/algorithm/rnn/hierarchical-rnn.md +++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md @@ -1,403 +1,403 @@ -# 双层RNN配置与示例 - -我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。 - -## 示例1:双进双出,subseq间无memory - -配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。 - -### 读取双层序列的方法 - -首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式): - -- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。 - -```text -2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 -2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * -2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 -2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 -2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . -2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 -2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! -2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 -2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * -2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 -``` - -- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。 - -```text -2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 -2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * - -2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 -2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 -2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . - -2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 -2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! - -2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 -2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * -2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 -``` - -其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`): - -- 单层序列的dataprovider如下: - - word_slot是integer_value_sequence类型,代表单层序列。 - - label是integer_value类型,代表一个向量。 - -```python -def hook(settings, dict_file, **kwargs): - settings.word_dict = dict_file - settings.input_types = [integer_value_sequence(len(settings.word_dict)), - integer_value(3)] - -@provider(init_hook=hook) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - label, comment = line.strip().split('\t') - label = int(''.join(label.split())) - words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - yield word_slot, label -``` - -- 双层序列的dataprovider如下: - - word_slot是integer_value_sub_sequence类型,代表双层序列。 - - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。 - - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。 - -```python -def hook2(settings, dict_file, **kwargs): - settings.word_dict = dict_file - settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), - integer_value_sequence(3)] - -@provider(init_hook=hook2) -def process2(settings, file_name): - with open(file_name) as fdata: - label_list = [] - word_slot_list = [] - for line in fdata: - if (len(line)) > 1: - label,comment = line.strip().split('\t') - label = int(''.join(label.split())) - words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - label_list.append(label) - word_slot_list.append(word_slot) - else: - yield word_slot_list, label_list - label_list = [] - word_slot_list = [] -``` - -### 模型中的配置 - -首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。 - -```python -settings(batch_size=5) - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# (lstm_input + lstm) is equal to lstmemory -with mixed_layer(size=hidden_dim*4) as lstm_input: - lstm_input += full_matrix_projection(input=emb) - -lstm = lstmemory_group(input=lstm_input, - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) - -lstm_last = last_seq(input=lstm) - -with mixed_layer(size=label_dim, - act=SoftmaxActivation(), - bias_attr=True) as output: - output += full_matrix_projection(input=lstm_last) - -outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) - -``` -其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析: - -- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。 -- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。 -- lstmemory: - - 单层序列过了一个mixed_layer和lstmemory_group。 - - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。 -- last_seq: - - 单层序列直接取了最后一个元素 - - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。 - - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样旳。 - -```python -settings(batch_size=2) - -data = data_layer(name="word", size=dict_dim) - -emb_group = embedding_layer(input=data, size=word_dim) - -# (lstm_input + lstm) is equal to lstmemory -def lstm_group(lstm_group_input): - with mixed_layer(size=hidden_dim*4) as group_input: - group_input += full_matrix_projection(input=lstm_group_input) - - lstm_output = lstmemory_group(input=group_input, - name="lstm_group", - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) - return lstm_output - -lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), - step=lstm_group, - name="lstm_nest_group") -# hasSubseq ->(seqlastins) seq -lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) - -# seq ->(expand) hasSubseq -lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) - -# hasSubseq ->(average) seq -lstm_average = pooling_layer(input=lstm_expand, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.EACH_SEQUENCE) - -with mixed_layer(size=label_dim, - act=SoftmaxActivation(), - bias_attr=True) as output: - output += full_matrix_projection(input=lstm_average) - -outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) -``` -## 示例2:双进双出,subseq间有memory - -配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。 - -### 读取双层序列的方法 - -我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`) -```python -data = [ - [[[1, 3, 2], [4, 5, 2]], 0], - [[[0, 2], [2, 5], [0, 1, 2]], 1], -] - -@provider(input_types=[integer_value_sub_sequence(10), - integer_value(3)]) -def process_subseq(settings, file_name): - for d in data: - yield d - -@provider(input_types=[integer_value_sequence(10), - integer_value(3)]) -def process_seq(settings, file_name): - for d in data: - seq = [] -``` -- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。 -- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。 -- 单双层序列的label都分别是0和1 - -### 模型中的配置 - -我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 - -- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 - -```python -def step(y): - mem = memory(name="rnn_state", size=hidden_dim) - return fc_layer(input=[y, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="rnn_state") - -out = recurrent_group(step=step, input=emb) -``` -- 双层序列,外层memory是一个元素: - - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 - - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。 - -```python -def outer_step(x): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - def inner_step(y): - inner_mem = memory(name="inner_rnn_state", - size=hidden_dim, - boot_layer=outer_mem) - return fc_layer(input=[y, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="inner_rnn_state") - - inner_rnn_output = recurrent_group( - step=inner_step, - input=x) - last = last_seq(input=inner_rnn_output, name="outer_rnn_state") - - return inner_rnn_output - -out = recurrent_group(step=outer_step, input=SubsequenceInput(emb)) -``` -- 双层序列,外层memory是单层序列: - - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。 - - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。 - -## 示例3:双进双出,输入不等长 - -**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`) - -### 读取双层序列的方法 - -我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`) -```python -data2 = [ - [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], - [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], -] - -@provider(input_types=[integer_value_sub_sequence(10), - integer_value_sub_sequence(10), - integer_value(2)], - should_shuffle=False) -def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider - for d in data2: - yield d - - -@provider(input_types=[integer_value_sequence(10), - integer_value_sequence(10), - integer_value(2)], - should_shuffle=False) -def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider - for d in data2: - words1=reduce(lambda x,y: x+y, d[0]) - words2=reduce(lambda x,y: x+y, d[1]) - yield words1, words2, d[2] -``` - -data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。 - -- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]] -- 双层序列:两个样本分别为 - - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]] - - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
- - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。 -- 单双层序列中,两个样本的label都分别是0和1 - -### 模型中的配置 - -单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。 - -- 单层序列: - - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 - - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。 - -```python -def step(x1, x2): - def calrnn(y): - mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim) - out = fc_layer(input = [y, mem], - size = hidden_dim, - act = TanhActivation(), - bias_attr = True, - name = 'rnn_state_' + y.name) - return out - - encoder1 = calrnn(x1) - encoder2 = calrnn(x2) - return [encoder1, encoder2] - -encoder1_rep, encoder2_rep = recurrent_group( - name="stepout", - step=step, - input=[emb1, emb2]) - -encoder1_last = last_seq(input = encoder1_rep) -encoder1_expandlast = expand_layer(input = encoder1_last, - expand_as = encoder2_rep) -context = mixed_layer(input = [identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep)], - size = hidden_dim) -``` -- 双层序列: - - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。 - - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。 - - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 - -```python -def outer_step(x1, x2): - outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim) - outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim) - def inner_step1(y): - inner_mem = memory(name = 'inner_rnn_state_' + y.name, - size = hidden_dim, - boot_layer = outer_mem1) - out = fc_layer(input = [y, inner_mem], - size = hidden_dim, - act = TanhActivation(), - bias_attr = True, - name = 'inner_rnn_state_' + y.name) - return out - - def inner_step2(y): - inner_mem = memory(name = 'inner_rnn_state_' + y.name, - size = hidden_dim, - boot_layer = outer_mem2) - out = fc_layer(input = [y, inner_mem], - size = hidden_dim, - act = TanhActivation(), - bias_attr = True, - name = 'inner_rnn_state_' + y.name) - return out - - encoder1 = recurrent_group( - step = inner_step1, - name = 'inner1', - input = x1) - - encoder2 = recurrent_group( - step = inner_step2, - name = 'inner2', - input = x2) - - sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1') - sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2') - - encoder1_expand = expand_layer(input = sentence_last_state1, - expand_as = encoder2) - - return [encoder1_expand, encoder2] - -encoder1_rep, encoder2_rep = recurrent_group( - name="outer", - step=outer_step, - input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], - targetInlink=emb2) - -encoder1_last = last_seq(input = encoder1_rep) -encoder1_expandlast = expand_layer(input = encoder1_last, - expand_as = encoder2_rep) -context = mixed_layer(input = [identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep)], - size = hidden_dim) -``` - -## 示例4:beam_search的生成 - -TBD \ No newline at end of file +# 双层RNN配置与示例 + +我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。 + +## 示例1:双进双出,subseq间无memory + +配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。 + +### 读取双层序列的方法 + +首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式): + +- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . +2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * + +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . + +2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! + +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`): + +- 单层序列的dataprovider如下: + - word_slot是integer_value_sequence类型,代表单层序列。 + - label是integer_value类型,代表一个向量。 + +```python +def hook(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sequence(len(settings.word_dict)), + integer_value(3)] + +@provider(init_hook=hook) +def process(settings, file_name): + with open(file_name, 'r') as fdata: + for line in fdata: + label, comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + yield word_slot, label +``` + +- 双层序列的dataprovider如下: + - word_slot是integer_value_sub_sequence类型,代表双层序列。 + - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。 + - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。 + +```python +def hook2(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), + integer_value_sequence(3)] + +@provider(init_hook=hook2) +def process2(settings, file_name): + with open(file_name) as fdata: + label_list = [] + word_slot_list = [] + for line in fdata: + if (len(line)) > 1: + label,comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + label_list.append(label) + word_slot_list.append(word_slot) + else: + yield word_slot_list, label_list + label_list = [] + word_slot_list = [] +``` + +### 模型中的配置 + +首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。 + +```python +settings(batch_size=5) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +with mixed_layer(size=hidden_dim*4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory_group(input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + +lstm_last = last_seq(input=lstm) + +with mixed_layer(size=label_dim, + act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) + +``` +其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析: + +- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。 +- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。 +- lstmemory: + - 单层序列过了一个mixed_layer和lstmemory_group。 + - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。 +- last_seq: + - 单层序列直接取了最后一个元素 + - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。 + - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样旳。 + +```python +settings(batch_size=2) + +data = data_layer(name="word", size=dict_dim) + +emb_group = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +def lstm_group(lstm_group_input): + with mixed_layer(size=hidden_dim*4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) + + lstm_output = lstmemory_group(input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + return lstm_output + +lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), + step=lstm_group, + name="lstm_nest_group") +# hasSubseq ->(seqlastins) seq +lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) + +# seq ->(expand) hasSubseq +lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) + +# hasSubseq ->(average) seq +lstm_average = pooling_layer(input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) + +with mixed_layer(size=label_dim, + act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_average) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) +``` +## 示例2:双进双出,subseq间有memory + +配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。 + +### 读取双层序列的方法 + +我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(3)]) +def process_subseq(settings, file_name): + for d in data: + yield d + +@provider(input_types=[integer_value_sequence(10), + integer_value(3)]) +def process_seq(settings, file_name): + for d in data: + seq = [] +``` +- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。 +- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。 +- 单双层序列的label都分别是0和1 + +### 模型中的配置 + +我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 + +- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 + +```python +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + return fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + +out = recurrent_group(step=step, input=emb) +``` +- 双层序列,外层memory是一个元素: + - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 + - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。 + +```python +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + return fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + + inner_rnn_output = recurrent_group( + step=inner_step, + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + return inner_rnn_output + +out = recurrent_group(step=outer_step, input=SubsequenceInput(emb)) +``` +- 双层序列,外层memory是单层序列: + - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。 + - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。 + +## 示例3:双进双出,输入不等长 + +**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`) + +### 读取双层序列的方法 + +我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data2 = [ + [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], + [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value_sub_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider + for d in data2: + yield d + + +@provider(input_types=[integer_value_sequence(10), + integer_value_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider + for d in data2: + words1=reduce(lambda x,y: x+y, d[0]) + words2=reduce(lambda x,y: x+y, d[1]) + yield words1, words2, d[2] +``` + +data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。 + +- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]] +- 双层序列:两个样本分别为 + - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]] + - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
+ - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。 +- 单双层序列中,两个样本的label都分别是0和1 + +### 模型中的配置 + +单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。 + +- 单层序列: + - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。 + +```python +def step(x1, x2): + def calrnn(y): + mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim) + out = fc_layer(input = [y, mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'rnn_state_' + y.name) + return out + + encoder1 = calrnn(x1) + encoder2 = calrnn(x2) + return [encoder1, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="stepout", + step=step, + input=[emb1, emb2]) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` +- 双层序列: + - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。 + - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。 + - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + +```python +def outer_step(x1, x2): + outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim) + outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim) + def inner_step1(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem1) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + def inner_step2(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem2) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + encoder1 = recurrent_group( + step = inner_step1, + name = 'inner1', + input = x1) + + encoder2 = recurrent_group( + step = inner_step2, + name = 'inner2', + input = x2) + + sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1') + sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2') + + encoder1_expand = expand_layer(input = sentence_last_state1, + expand_as = encoder2) + + return [encoder1_expand, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], + targetInlink=emb2) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` + +## 示例4:beam_search的生成 + +TBD diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md b/doc_cn/algorithm/rnn/rnn-tutorial.md index 7a553054c80392946ba5b16cc31bcaea18cfc977..9e488b0d51956e86f9fb76f450fdb438f596e239 100644 --- a/doc_cn/algorithm/rnn/rnn-tutorial.md +++ b/doc_cn/algorithm/rnn/rnn-tutorial.md @@ -93,4 +93,4 @@ memory只能在`recurrent_group`中定义和使用。memory不能独立存在, 使用`beam_search`需要遵循以下约定: - 单层RNN:从一个word生成下一个word。 -- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 \ No newline at end of file +- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 diff --git a/doc_cn/build_and_install/install/paddle_version.txt b/doc_cn/build_and_install/install/paddle_version.txt index 7b2bfd2b1b3a9850665d118e424fd0cf6c24a062..a80873303fd0d05d963482629000d76260185ef6 100644 --- a/doc_cn/build_and_install/install/paddle_version.txt +++ b/doc_cn/build_and_install/install/paddle_version.txt @@ -8,4 +8,4 @@ PaddlePaddle 0.8.0b1, compiled with with_gflags: ON with_metric_learning: with_timer: OFF - with_predict_sdk: \ No newline at end of file + with_predict_sdk: diff --git a/doc_cn/concepts/use_concepts.rst b/doc_cn/concepts/use_concepts.rst index 9952c2df7fb041045d9121d65077734bac7a308b..67e98edabc0c2a4ecdf8d7993f8dd66b9365a05d 100644 --- a/doc_cn/concepts/use_concepts.rst +++ b/doc_cn/concepts/use_concepts.rst @@ -4,7 +4,7 @@ PaddlePaddle 基本使用概念 PaddlePaddle是一个神经网络学习框架。其单机进程为 :code:`paddle train`。 单机的所有设备使用,均在单机进程内调度完成。 而多机辅助进程 :code:`paddle pserver` 负责联合多个单机进程进行通信,进而充分利用集群的计算资源。 PaddlePaddle同时以 :code:`swig api` 的形式,提供训练结果模型预测的方法和自定义训练流程。 -下面我们会分别介绍主要进程 :code:`paddle train` 中的一些概念。这些概念会对如何使用PaddlePaddle有一定的帮助。 了解这些概念的前提是,读者已经了解 `基本的神经网络/机器学习原理和概念 `_ 。同时,如果想要了解PaddlePaddle实现中的一些概念,请参考 `PaddlePaddle 编程中的基本概念 `_ 。 +下面我们会分别介绍主要进程 :code:`paddle train` 中的一些概念。这些概念会对如何使用PaddlePaddle有一定的帮助。 了解这些概念的前提是,读者已经了解 `基本的神经网络/机器学习原理和概念 `_ 。同时,如果想要了解PaddlePaddle实现中的一些概念,请参考 `PaddlePaddle 编程中的基本概念 `_ 。 .. contents:: @@ -184,8 +184,8 @@ PaddlePaddle多机使用的经典方法是通过 :code:`Parameter Server` 来对 详细的说明可以参考,使用 `集群训练Paddle`_ 。 -.. _PyDataProvider: ../ui/data_provider/pydataprovider2.rst -.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.rst -.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.rst +.. _PyDataProvider: ../ui/data_provider/pydataprovider2.html +.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings +.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer .. _masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus -.. _集群训练Paddle: ../cluster/index.rst +.. _集群训练Paddle: ../cluster/index.html diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc_cn/faq/reduce_min_pool_size.py index 2811b134b66b1ec55903d89e3f38a0cef8c9ef8d..5715397cc11e18246b8522fcc5b4f05780c9a0a7 100644 --- a/doc_cn/faq/reduce_min_pool_size.py +++ b/doc_cn/faq/reduce_min_pool_size.py @@ -3,4 +3,4 @@ def process(settings, filename): os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. with open('%s.shuf' % filename, 'r') as f: for line in f: - yield get_sample_from_line(line) \ No newline at end of file + yield get_sample_from_line(line) diff --git a/paddle/.common_test_util.sh b/paddle/.common_test_util.sh index dec22e45619fb5d393be96e929a7e301bf266224..dc1525061590808e3cc9c7b606aca5d5d9195a3a 100644 --- a/paddle/.common_test_util.sh +++ b/paddle/.common_test_util.sh @@ -117,4 +117,4 @@ set_port() fi done -} \ No newline at end of file +} diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index cae0f64400a7e618bffb4f7fc6a044011baf04d4..fb3af8ea92feed96a9669bfb29ef7353a256c308 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -17,5 +17,3 @@ endif() if(WITH_SWIG_PY) add_subdirectory(api) endif() - - diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h index 93cdca8c4beaaad70a40e5899ccd908594425f4f..5ffeff6a9726c7445db36c7c1bec7c74825884a0 100644 --- a/paddle/api/PaddleAPIPrivate.h +++ b/paddle/api/PaddleAPIPrivate.h @@ -65,4 +65,3 @@ struct ArgumentsPrivate { return *(std::shared_ptr*)(rawPtr); } }; - diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt index c4c26e6c03fdff51696f75f4d6a522cff60e7cca..08a0fe96a004d38b81d0bac881da1faeb52685f4 100644 --- a/paddle/api/test/CMakeLists.txt +++ b/paddle/api/test/CMakeLists.txt @@ -1,2 +1,2 @@ add_test(NAME test_swig_api - COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh) \ No newline at end of file + COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh) diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py index 11035a9281656c49b6d1757dbac2f7f58cb7d8c8..2160612888b0f7ed6b504e5fc5933dfb3781f167 100644 --- a/paddle/api/test/testMatrix.py +++ b/paddle/api/test/testMatrix.py @@ -69,8 +69,8 @@ class TestMatrix(unittest.TestCase): def test_numpy(self): numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32") m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat) - self.assertEqual( - (int(m.getHeight()), int(m.getWidth())), numpy_mat.shape) + self.assertEqual((int(m.getHeight()), int(m.getWidth())), + numpy_mat.shape) # the numpy matrix and paddle matrix shared the same memory. numpy_mat[0, 1] = 342.23 diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 02fa6bc3ace32ff5ffe51dcce9c49757a990a9b2..9f80898a1f927a0e8bbf86108567a04ccecc38f5 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -254,4 +254,3 @@ extern __thread cudaStream_t default_stream; #endif /* __NVCC__ */ #endif /* HL_BASE_H_ */ - diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h index 34c173908246e4a48c327c8aa58730756bbc72b7..b96804afd86ba5e8c7b7eed7eb768295b4e23096 100644 --- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h @@ -199,4 +199,3 @@ inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, real *savedInvVar) {} #endif // HL_CUDA_CUDNN_STUB_H_ - diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h index 808c2508d1a1a09fb25f052047d6b0539cad8df2..2922d4dc2937662d66fb2433f4883448ba21fa3f 100644 --- a/paddle/cuda/src/avx_mathfun.h +++ b/paddle/cuda/src/avx_mathfun.h @@ -718,4 +718,3 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { *s = _mm256_xor_ps(xmm1, sign_bit_sin); *c = _mm256_xor_ps(xmm2, sign_bit_cos); } - diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 2f85dd3c3b69d21cffede49b001298c6629900a6..3c2df52fed4f86675ce8f1ead6a3b66e4babde34 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -605,7 +605,7 @@ public: int batchSize = input->getHeight(); int size = 1; resizeOutput(batchSize, size); - output_.value->sumRows(*input); + output_.value->sumRows(*input, /* scaleSum= */1, /* scaleDest= */0); } virtual void backward(const UpdateCallback& callback = nullptr) { diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp index 8241cbd37ec623622f19ff2ba35c21a4e3e3533a..f17c1b05bd892c7d933e4910887f977ac5cda79b 100644 --- a/paddle/gserver/layers/FullMatrixProjection.cpp +++ b/paddle/gserver/layers/FullMatrixProjection.cpp @@ -52,7 +52,9 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) { } hl_set_sync_flag(syncFlag); - parameter_->incUpdate(callback); + if (weight_->getWGrad()) { + parameter_->incUpdate(callback); + } } } // namespace paddle diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h index 24b6c547e7bc8a60d9374a55074416ea1b9bbc72..334eb4b722f4ff9a794a3818a1cf3087da27692f 100644 --- a/paddle/gserver/layers/FullyConnectedLayer.h +++ b/paddle/gserver/layers/FullyConnectedLayer.h @@ -48,4 +48,3 @@ public: }; } // namespace paddle - diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c0a7072c6a7cc1d37723f43d1068483779f56437 --- /dev/null +++ b/paddle/gserver/layers/ScalingProjection.cpp @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Projection.h" + +namespace paddle { + +class ScalingProjection : public Projection { +public: + ScalingProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK_EQ(parameter->getSize(), 1UL); + weight_.reset(new Weight(1, 1, parameter)); + } + + void forward() { + CHECK(in_->value); + out_->value->add(*in_->value, weight_->getW()->getElement(0, 0)); + } + + void backward(const UpdateCallback& callback) { + if (weight_->getWGrad()) { + auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_); + sum->sumOfProducts(*in_->value, *out_->grad, + /* scaleSum= */1, /* scaleDest= */0); + weight_->getWGrad()->sumCols(*sum, + /* scaleSum= */1, /* scaleDest= */1); + parameter_->incUpdate(callback); + } + if (in_->grad) { + in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0)); + } + } + +protected: + std::unique_ptr weight_; +}; + +REGISTER_PROJECTION(scaling, ScalingProjection); + +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index f3cd2b4faf0c173cbb4997aac1a00ebba3027c92..a79dfe39c9bb26c7b2acec1051699e1804494d93 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -135,6 +135,17 @@ TEST(Projection, identity) { } } +TEST(Projection, scaling) { + ProjectionConfig conf; + conf.set_type("scaling"); + conf.set_input_size(10); + conf.set_output_size(10); + for (auto useGpu : {false}) { + testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1, + /* batchSize */ 100, useGpu); + } +} + #ifndef PADDLE_ONLY_CPU TEST(Projection, conv) { const int NUM_FILTERS = 16; diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index d81b99e5441584b21fb023dcae65ccec7dd27996..54448bdb5a9bb4f665f28f973eada30a07fb5eee 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -1451,6 +1451,8 @@ int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); @@ -1463,18 +1465,69 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); return 0; } +template<> +template +int BaseMatrixT::applyRow( + Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { + if (scaleDest != 0) { + applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); + } else { + applyRow(agg, base::binary::second(), b); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template<> +template +int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, + BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + int numRows = b.height_; + int numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); + CHECK_EQ(c.height_, numRows); + CHECK_EQ(c.width_, numCols); + aggregate(agg, op, sv, + b, c, numRows, numCols, offset, + false_type(), true_type() /*aAsColVector*/); + return 0; +} + +template<> +template +int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, + BaseMatrixT& b, BaseMatrixT& c) { + if (scaleDest != 0) { + applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); + } else { + applyRow(agg, op, base::binary::second(), b, c); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + template<> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; + CHECK_EQ(width_, numCols); + CHECK_EQ(height_, 1UL); aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); @@ -1487,6 +1540,8 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; + CHECK_EQ(width_, numCols); + CHECK_EQ(height_, 1UL); aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); @@ -1494,8 +1549,23 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { } template<> -void BaseMatrixT::sumRows(BaseMatrixT& b) { - applyRow(aggregate::sum(), b); +template +int BaseMatrixT::applyCol( + Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { + if (scaleDest != 0) { + applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); + } else { + applyCol(agg, base::binary::second(), b); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template<> +void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), scaleDest, scaleSum, b); } template<> @@ -1524,18 +1594,22 @@ void BaseMatrixT::minCols(BaseMatrixT& b) { } template<> -void BaseMatrixT::sumCols(BaseMatrixT& b, real scale) { - applyCol(aggregate::sum(), base::binary::add2(1.0, scale), b); +void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { + applyCol(aggregate::sum(), scaleDest, scaleSum, b); } template<> -void BaseMatrixT::sumOfSquares(BaseMatrixT& b, BaseMatrixT& c) { - int numRows = b.height_; - int numCols = b.width_; - MatrixOffset offset(0, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::squaredDiff(), base::binary::add(), - b, c, numRows, numCols, offset, false_type(), - true_type() /*aAsColVector*/); +void BaseMatrixT::sumOfSquaredDiffs( + BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), base::binary::squaredDiff(), + scaleDest, scaleSum, b, c); +} + +template<> +void BaseMatrixT::sumOfProducts( + BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), + scaleDest, scaleSum, b, c); } template class BaseMatrixT; diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 2dd2c2c7a9b985924d53cb3bf8840eb1e55eee3e..3a91fdc3c30c5332866a97c256b018eb0982260f 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -305,6 +305,23 @@ public: template int applyRow(Agg agg, BaseMatrixT& b); + /** + * a aggregate expression that apply each row of matrix b. + * + * @code + * for each row i & 0 <= j < b.width_, do: + * dst = agg(op(b[i*ldb + j], c[i*ldc + j]) + * this[i] = sv(this[i], dst) + * @endcode + */ + template + int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c); + + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, + BaseMatrixT& b, BaseMatrixT& c); + /** * a aggregate expression that apply each row of matrix b. * @@ -317,6 +334,10 @@ public: template int applyRow(Agg agg, Saver sv, BaseMatrixT& b); + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); + /** * a aggregate expression that apply each column of matrix b. * @@ -340,6 +361,10 @@ public: template int applyCol(Agg agg, Saver sv, BaseMatrixT& b); + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); + bool useGpu() const { return useGpu_; } const T* rowBuf(size_t row) const { return data_ + width_ * row; } @@ -920,7 +945,9 @@ public: void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); /// calculate the sum of each row of the matrix b. - void sumRows(BaseMatrixT& b); + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} + void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest); + /// calculate the maximum value of each row of the matrix b. void maxRows(BaseMatrixT& b); /// calculate the minimum value of each row of the matrix b. @@ -932,10 +959,18 @@ public: void maxCols(BaseMatrixT& b); /// calculate the minimum value of each column of the matrix b. void minCols(BaseMatrixT& b); - void sumCols(BaseMatrixT& b, T scale); - /// calculate the sum of each row of (b - c)^2. - void sumOfSquares(BaseMatrixT& b, BaseMatrixT& c); + /// calculate the sum of each column of the matrix b. + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji} + void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest); + + /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2 + void sumOfSquaredDiffs(BaseMatrixT& b, BaseMatrixT& c, + T scaleSum, T scaleDest); + + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij} + void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, + T scaleSum, T scaleDest); /** * @code diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index b322bd2bd719484b86b62bca5783d78bd8ca9a4c..29c07467c7bac9c382f02a5f6ffdcfd87c5b09a0 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -80,4 +80,3 @@ void vTanh(const int n, const T* a, T* r); } // namespace paddle #endif // MATHFUNCTIONS_H_ - diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 5ee8fbebfcfbe9696f7836b6b1c88e724551da8e..706a598d0c33762b0578190ea4a0aa06247a88ef 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -242,7 +242,7 @@ real GpuMatrix::getSum() { void GpuMatrix::accumulateColSum(Matrix& src) { CHECK_EQ(getWidth(), src.getWidth()); CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, 1.0); + sumCols(src, 1.0, 1.0); } real GpuMatrix::getAbsSum() { @@ -389,7 +389,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(width_, a.getWidth()); GpuSparseMatrix* sMatPtr = dynamic_cast(&a); if (!sMatPtr) { - sumCols(a, scale); + sumCols(a, /* scaleSum= */scale, /* scaleDest= */1); } else { real* data = getData(); hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); @@ -589,7 +589,7 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) { void GpuMatrix::colMerge(Matrix& src) { CHECK(src.height_ == height_); if (!trans_ && !src.trans_) { - sumRows(src); + sumRows(src, /* scaleSum= */1, /* scaleDest= */0); } else { LOG(FATAL) << "Is not supported"; } @@ -599,7 +599,7 @@ void GpuMatrix::rowSum(Matrix& sum) { CHECK_EQ(sum.getHeight(), getHeight()); CHECK_EQ(sum.getWidth(), (size_t)1); - sum.sumRows(*this); + sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0); } void GpuMatrix::rowMax(Matrix& max) { @@ -790,7 +790,8 @@ void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { LOG(FATAL) << "not supported: GpuSparseMatrix as label"; } - BaseMatrix::sumOfSquares(output, label); + BaseMatrix::sumOfSquaredDiffs(output, label, + /* scaleSum= */1, /* scaleDest= */1); } void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { @@ -1501,7 +1502,7 @@ void CpuMatrix::accumulateColSum(Matrix& src) { CHECK_EQ(getWidth(), src.getWidth()); CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, 1.0); + sumCols(src, /* scaleSum= */1, /* scaleDest= */1); } real CpuMatrix::getAbsSum() { @@ -2188,7 +2189,7 @@ void CpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(width_, a.getWidth()); CpuSparseMatrix* aptr = dynamic_cast(&a); if (!aptr) { - sumCols(a, scale); + sumCols(a, /* scaleSum= */scale, /* scaleDest= */1); } else { size_t nnz = aptr->getElementCnt(); int* cols = aptr->getCols(); @@ -2227,7 +2228,7 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, real* dst = getData(); real* src = a.getData(); const int* starts = startsPos.getData(); - MatrixPtr outMtx = Matrix::create(1, 1, false, false); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); for (size_t i = 0; i < height; i++) { int sequenceLength = starts[i + 1] - starts[i]; @@ -2239,13 +2240,15 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, dataMtx->setData(src + starts[i] * width, sequenceLength, width); if (mode == 0) { // plain average - outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength); + outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength, + /* scaleDest= */1); } else if (mode == 1) { // sum instead of average - outMtx->sumCols(*dataMtx, (real)1); + outMtx->sumCols(*dataMtx, /* scaleSum= */1, /* scaleDest= */1); } else if (mode == 2) { // divide by square root of sequenceLength - outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength)); + outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength), + /* scaleDest= */1); } else { LOG(FATAL) << "should not reach here"; } @@ -2932,7 +2935,7 @@ void CpuMatrix::rowSum(Matrix& sum) { CHECK_EQ(sum.getHeight(), getHeight()); CHECK_EQ(sum.getWidth(), (size_t)1); - sum.sumRows(*this); + sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0); } void CpuMatrix::rowMaxId(IVector& maxIds) { @@ -3485,7 +3488,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { } } - BaseMatrix::sumOfSquares(output, label); + BaseMatrix::sumOfSquaredDiffs(output, label, + /* scaleSum= */1, /* scaleDest= */1); } /* calculate the error of outputV according to label */ diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt index d6f67604c03485dfeb3c907705b117ac550e9b6f..a35e46997fb04e9378e106bf428a629b286c2e8c 100644 --- a/paddle/parameter/CMakeLists.txt +++ b/paddle/parameter/CMakeLists.txt @@ -10,4 +10,4 @@ add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS}) add_dependencies(paddle_parameter gen_proto_cpp) if(WITH_TESTING) add_subdirectory(tests) -endif() \ No newline at end of file +endif() diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/parameter/tests/CMakeLists.txt index 177fb2fdfc045e1c68cc56e9f7654cbda5f46e25..cab264db8e5000e8eb61830ec07e9f590c103119 100644 --- a/paddle/parameter/tests/CMakeLists.txt +++ b/paddle/parameter/tests/CMakeLists.txt @@ -1 +1 @@ -add_simple_unittest(test_common) \ No newline at end of file +add_simple_unittest(test_common) diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index dee46055c5a4db8c77b4248de70573c02a60e631..1bae396a18688cd53e164774df07660ccc2451d7 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -6,4 +6,4 @@ configure_file(submit_local.sh.in install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle) \ No newline at end of file + RENAME paddle) diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py index 5e905b865fc5167a2bddcf4ca0ab8313d17af4a3..157ce7b44ac3cfe3a8ca5eda78e959cf7be4cc5b 100644 --- a/paddle/scripts/cpplint.py +++ b/paddle/scripts/cpplint.py @@ -27,7 +27,6 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - """Does google-lint on c++ files. The goal of this script is to identify places in the code that *may* @@ -55,7 +54,6 @@ import string import sys import unicodedata - _USAGE = """ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] [--counting=total|toplevel|detailed] [--root=subdir] @@ -242,13 +240,11 @@ _ERROR_CATEGORIES = [ 'whitespace/semicolon', 'whitespace/tab', 'whitespace/todo', - ] +] # These error categories are no longer enforced by cpplint, but for backwards- # compatibility they may still appear in NOLINT comments. -_LEGACY_ERROR_CATEGORIES = [ - 'readability/streams', - ] +_LEGACY_ERROR_CATEGORIES = ['readability/streams', ] # The default state of the category filter. This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be @@ -394,8 +390,7 @@ _CPP_HEADERS = frozenset([ 'cuchar', 'cwchar', 'cwctype', - ]) - +]) # These headers are excluded from [build/include] and [build/include_order] # checks: @@ -405,38 +400,40 @@ _CPP_HEADERS = frozenset([ _THIRD_PARTY_HEADERS_PATTERN = re.compile( r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') - # Assertion macros. These are defined in base/logging.h and # testing/base/gunit.h. Note that the _M versions need to come first # for substring matching to work. _CHECK_MACROS = [ - 'DCHECK', 'CHECK', - 'EXPECT_TRUE_M', 'EXPECT_TRUE', - 'ASSERT_TRUE_M', 'ASSERT_TRUE', - 'EXPECT_FALSE_M', 'EXPECT_FALSE', - 'ASSERT_FALSE_M', 'ASSERT_FALSE', - ] + 'DCHECK', + 'CHECK', + 'EXPECT_TRUE_M', + 'EXPECT_TRUE', + 'ASSERT_TRUE_M', + 'ASSERT_TRUE', + 'EXPECT_FALSE_M', + 'EXPECT_FALSE', + 'ASSERT_FALSE_M', + 'ASSERT_FALSE', +] # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE _CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) -for op, replacement in [('==', 'EQ'), ('!=', 'NE'), - ('>=', 'GE'), ('>', 'GT'), +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), ('>=', 'GE'), ('>', 'GT'), ('<=', 'LE'), ('<', 'LT')]: - _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement - _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement - -for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), - ('>=', 'LT'), ('>', 'LE'), - ('<=', 'GT'), ('<', 'GE')]: - _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), ('>=', 'LT'), + ('>', 'LE'), ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement # Alternative tokens and their replacements. For full list, see section 2.5 # Alternative tokens [lex.digraph] in the C++ standard. @@ -455,16 +452,15 @@ _ALT_TOKEN_REPLACEMENT = { 'xor_eq': '^=', 'not': '!', 'not_eq': '!=' - } +} # Compile regular expression that matches all the above keywords. The "[ =()]" # bit is meant to avoid matching these keywords outside of boolean expressions. # # False positives include C-style multi-line comments and multi-line strings # but those have always been troublesome for cpplint. -_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( - r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') - +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(r'[ =()](' + ('|'.join( + _ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') # These constants define types of headers for use with # _IncludeState.CheckNextIncludeOrder(). @@ -475,17 +471,16 @@ _POSSIBLE_MY_HEADER = 4 _OTHER_HEADER = 5 # These constants define the current inline assembly state -_NO_ASM = 0 # Outside of inline assembly block -_INSIDE_ASM = 1 # Inside inline assembly block -_END_ASM = 2 # Last line of inline assembly block -_BLOCK_ASM = 3 # The whole block is an inline assembly block +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block # Match start of assembly blocks _MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' r'(?:\s+(volatile|__volatile__))?' r'\s*[{(]') - _regexp_compile_cache = {} # {str, set(int)}: a map from error categories to sets of linenumbers @@ -504,8 +499,9 @@ _line_length = 80 # This is set by --extensions flag. _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of error-suppressions. + """Updates the global list of error-suppressions. Parses any NOLINT comments on the current line, updating the global error_suppressions store. Reports an error if the NOLINT comment @@ -517,45 +513,47 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error): linenum: int, the number of the current line. error: function, an error handler. """ - matched = Search(r'\bNOLINT(NEXTLINE(S_\d+)?)?\b(\([^)]+\))?', raw_line) - if matched: - if matched.group(1): - lines = matched.group(2) - if lines : - lines=int(lines[2:]) - suppressed_line = [ linenum + i for i in xrange(lines) ] - else: - suppressed_line = linenum + 1 - else: - suppressed_line = linenum - category = matched.group(3) - if category in (None, '(*)'): # => "suppress all" - if isinstance(suppressed_line, int): - _error_suppressions.setdefault(None, set()).add(suppressed_line) - else: - for _line in suppressed_line: - _error_suppressions.setdefault(None, set()).add(_line) - else: - if category.startswith('(') and category.endswith(')'): - category = category[1:-1] - if category in _ERROR_CATEGORIES: - if isinstance(suppressed_line, int): - _error_suppressions.setdefault(category, set()).add(suppressed_line) - else: - for _line in suppressed_line: - _error_suppressions.setdefault(category, set()).add(_line) - elif category not in _LEGACY_ERROR_CATEGORIES: - error(filename, linenum, 'readability/nolint', 5, - 'Unknown NOLINT error category: %s' % category) + matched = Search(r'\bNOLINT(NEXTLINE(S_\d+)?)?\b(\([^)]+\))?', raw_line) + if matched: + if matched.group(1): + lines = matched.group(2) + if lines: + lines = int(lines[2:]) + suppressed_line = [linenum + i for i in xrange(lines)] + else: + suppressed_line = linenum + 1 + else: + suppressed_line = linenum + category = matched.group(3) + if category in (None, '(*)'): # => "suppress all" + if isinstance(suppressed_line, int): + _error_suppressions.setdefault(None, set()).add(suppressed_line) + else: + for _line in suppressed_line: + _error_suppressions.setdefault(None, set()).add(_line) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + if isinstance(suppressed_line, int): + _error_suppressions.setdefault( + category, set()).add(suppressed_line) + else: + for _line in suppressed_line: + _error_suppressions.setdefault(category, + set()).add(_line) + elif category not in _LEGACY_ERROR_CATEGORIES: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) def ResetNolintSuppressions(): - """Resets the set of NOLINT suppressions to empty.""" - _error_suppressions.clear() + """Resets the set of NOLINT suppressions to empty.""" + _error_suppressions.clear() def IsErrorSuppressedByNolint(category, linenum): - """Returns true if the specified error category is suppressed on this line. + """Returns true if the specified error category is suppressed on this line. Consults the global error_suppressions map populated by ParseNolintSuppressions/ResetNolintSuppressions. @@ -566,22 +564,22 @@ def IsErrorSuppressedByNolint(category, linenum): Returns: bool, True iff the error should be suppressed due to a NOLINT comment. """ - return (linenum in _error_suppressions.get(category, set()) or - linenum in _error_suppressions.get(None, set())) + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) def Match(pattern, s): - """Matches the string with the pattern, caching the compiled regexp.""" - # The regexp compilation caching is inlined in both Match and Search for - # performance reasons; factoring it out into a separate function turns out - # to be noticeably expensive. - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].match(s) + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) def ReplaceAll(pattern, rep, s): - """Replaces instances of pattern in a string with a replacement. + """Replaces instances of pattern in a string with a replacement. The compiled regex is kept in a cache shared by Match and Search. @@ -593,20 +591,20 @@ def ReplaceAll(pattern, rep, s): Returns: string with replacements made (or original string if no replacements) """ - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].sub(rep, s) + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].sub(rep, s) def Search(pattern, s): - """Searches the string for the pattern, caching the compiled regexp.""" - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].search(s) + """Searches the string for the pattern, caching the compiled regexp.""" + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) class _IncludeState(object): - """Tracks line numbers for includes, and the order in which includes appear. + """Tracks line numbers for includes, and the order in which includes appear. include_list contains list of lists of (header, line number) pairs. It's a lists of lists rather than just one flat list to make it @@ -617,35 +615,35 @@ class _IncludeState(object): raise an _IncludeError with an appropriate error message. """ - # self._section will move monotonically through this set. If it ever - # needs to move backwards, CheckNextIncludeOrder will raise an error. - _INITIAL_SECTION = 0 - _MY_H_SECTION = 1 - _C_SECTION = 2 - _CPP_SECTION = 3 - _OTHER_H_SECTION = 4 - - _TYPE_NAMES = { - _C_SYS_HEADER: 'C system header', - _CPP_SYS_HEADER: 'C++ system header', - _LIKELY_MY_HEADER: 'header this file implements', - _POSSIBLE_MY_HEADER: 'header this file may implement', - _OTHER_HEADER: 'other header', - } - _SECTION_NAMES = { - _INITIAL_SECTION: "... nothing. (This can't be an error.)", - _MY_H_SECTION: 'a header this file implements', - _C_SECTION: 'C system header', - _CPP_SECTION: 'C++ system header', - _OTHER_H_SECTION: 'other header', - } - - def __init__(self): - self.include_list = [[]] - self.ResetSection('') - - def FindHeader(self, header): - """Check if a header has already been included. + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. + _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + self.include_list = [[]] + self.ResetSection('') + + def FindHeader(self, header): + """Check if a header has already been included. Args: header: header to check. @@ -653,35 +651,35 @@ class _IncludeState(object): Line number of previous occurrence, or -1 if the header has not been seen before. """ - for section_list in self.include_list: - for f in section_list: - if f[0] == header: - return f[1] - return -1 + for section_list in self.include_list: + for f in section_list: + if f[0] == header: + return f[1] + return -1 - def ResetSection(self, directive): - """Reset section checking for preprocessor directive. + def ResetSection(self, directive): + """Reset section checking for preprocessor directive. Args: directive: preprocessor directive (e.g. "if", "else"). """ - # The name of the current section. - self._section = self._INITIAL_SECTION - # The path of last found header. - self._last_header = '' + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' - # Update list of includes. Note that we never pop from the - # include list. - if directive in ('if', 'ifdef', 'ifndef'): - self.include_list.append([]) - elif directive in ('else', 'elif'): - self.include_list[-1] = [] + # Update list of includes. Note that we never pop from the + # include list. + if directive in ('if', 'ifdef', 'ifndef'): + self.include_list.append([]) + elif directive in ('else', 'elif'): + self.include_list[-1] = [] - def SetLastHeader(self, header_path): - self._last_header = header_path + def SetLastHeader(self, header_path): + self._last_header = header_path - def CanonicalizeAlphabeticalOrder(self, header_path): - """Returns a path canonicalized for alphabetical comparison. + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. - replaces "-" with "_" so they both cmp the same. - removes '-inl' since we don't require them to be after the main header. @@ -693,10 +691,10 @@ class _IncludeState(object): Returns: Canonicalized path. """ - return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() - def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): - """Check if a header is in alphabetical order with the previous header. + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. Args: clean_lines: A CleansedLines instance containing the file. @@ -706,18 +704,18 @@ class _IncludeState(object): Returns: Returns true if the header is in alphabetical order. """ - # If previous section is different from current section, _last_header will - # be reset to empty string, so it's always less than current header. - # - # If previous line was a blank line, assume that the headers are - # intentionally sorted the way they are. - if (self._last_header > header_path and - Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): - return False - return True + # If previous section is different from current section, _last_header will + # be reset to empty string, so it's always less than current header. + # + # If previous line was a blank line, assume that the headers are + # intentionally sorted the way they are. + if (self._last_header > header_path and + Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): + return False + return True - def CheckNextIncludeOrder(self, header_type): - """Returns a non-empty error message if the next header is out of order. + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. This function also updates the internal state to be ready to check the next include. @@ -730,80 +728,79 @@ class _IncludeState(object): error message describing what's wrong. """ - error_message = ('Found %s after %s' % - (self._TYPE_NAMES[header_type], - self._SECTION_NAMES[self._section])) - - last_section = self._section - - if header_type == _C_SYS_HEADER: - if self._section <= self._C_SECTION: - self._section = self._C_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _CPP_SYS_HEADER: - if self._section <= self._CPP_SECTION: - self._section = self._CPP_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _LIKELY_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - self._section = self._OTHER_H_SECTION - elif header_type == _POSSIBLE_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - # This will always be the fallback because we're not sure - # enough that the header is associated with this file. - self._section = self._OTHER_H_SECTION - else: - assert header_type == _OTHER_HEADER - self._section = self._OTHER_H_SECTION + error_message = ('Found %s after %s' % ( + self._TYPE_NAMES[header_type], self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION - if last_section != self._section: - self._last_header = '' + if last_section != self._section: + self._last_header = '' - return '' + return '' class _CppLintState(object): - """Maintains module-wide state..""" - - def __init__(self): - self.verbose_level = 1 # global setting. - self.error_count = 0 # global count of reported errors - # filters to apply when emitting error messages - self.filters = _DEFAULT_FILTERS[:] - # backup of filter list. Used to restore the state after each file. - self._filters_backup = self.filters[:] - self.counting = 'total' # In what way are we counting errors? - self.errors_by_category = {} # string to int dict storing error counts - - # output format: - # "emacs" - format that emacs can parse (default) - # "vs7" - format that Microsoft Visual Studio 7 can parse - self.output_format = 'emacs' - - def SetOutputFormat(self, output_format): - """Sets the output format for errors.""" - self.output_format = output_format - - def SetVerboseLevel(self, level): - """Sets the module's verbosity, and returns the previous setting.""" - last_verbose_level = self.verbose_level - self.verbose_level = level - return last_verbose_level - - def SetCountingStyle(self, counting_style): - """Sets the module's counting options.""" - self.counting = counting_style - - def SetFilters(self, filters): - """Sets the error-message filters. + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + # backup of filter list. Used to restore the state after each file. + self._filters_backup = self.filters[:] + self.counting = 'total' # In what way are we counting errors? + self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. These filters are applied when deciding whether to emit a given error message. @@ -816,86 +813,88 @@ class _CppLintState(object): ValueError: The comma-separated filters did not all start with '+' or '-'. E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" """ - # Default filters always have less priority than the flag ones. - self.filters = _DEFAULT_FILTERS[:] - self.AddFilters(filters) - - def AddFilters(self, filters): - """ Adds more filters to the existing list of error-message filters. """ - for filt in filters.split(','): - clean_filt = filt.strip() - if clean_filt: - self.filters.append(clean_filt) - for filt in self.filters: - if not (filt.startswith('+') or filt.startswith('-')): - raise ValueError('Every filter in --filters must start with + or -' - ' (%s does not)' % filt) - - def BackupFilters(self): - """ Saves the current filter list to backup storage.""" - self._filters_backup = self.filters[:] + # Default filters always have less priority than the flag ones. + self.filters = _DEFAULT_FILTERS[:] + self.AddFilters(filters) + + def AddFilters(self, filters): + """ Adds more filters to the existing list of error-message filters. """ + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError( + 'Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def BackupFilters(self): + """ Saves the current filter list to backup storage.""" + self._filters_backup = self.filters[:] + + def RestoreFilters(self): + """ Restores filters previously backed up.""" + self.filters = self._filters_backup[:] + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stdout.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stdout.write('Total errors found: %d\n' % self.error_count) - def RestoreFilters(self): - """ Restores filters previously backed up.""" - self.filters = self._filters_backup[:] - - def ResetErrorCounts(self): - """Sets the module's error statistic back to zero.""" - self.error_count = 0 - self.errors_by_category = {} - - def IncrementErrorCount(self, category): - """Bumps the module's error statistic.""" - self.error_count += 1 - if self.counting in ('toplevel', 'detailed'): - if self.counting != 'detailed': - category = category.split('/')[0] - if category not in self.errors_by_category: - self.errors_by_category[category] = 0 - self.errors_by_category[category] += 1 - - def PrintErrorCounts(self): - """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): - sys.stdout.write('Category \'%s\' errors found: %d\n' % - (category, count)) - sys.stdout.write('Total errors found: %d\n' % self.error_count) _cpplint_state = _CppLintState() def _OutputFormat(): - """Gets the module's output format.""" - return _cpplint_state.output_format + """Gets the module's output format.""" + return _cpplint_state.output_format def _SetOutputFormat(output_format): - """Sets the module's output format.""" - _cpplint_state.SetOutputFormat(output_format) + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) def _VerboseLevel(): - """Returns the module's verbosity setting.""" - return _cpplint_state.verbose_level + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level def _SetVerboseLevel(level): - """Sets the module's verbosity, and returns the previous setting.""" - return _cpplint_state.SetVerboseLevel(level) + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) def _SetCountingStyle(level): - """Sets the module's counting options.""" - _cpplint_state.SetCountingStyle(level) + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) def _Filters(): - """Returns the module's list of output filters, as a list.""" - return _cpplint_state.filters + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters def _SetFilters(filters): - """Sets the module's error-message filters. + """Sets the module's error-message filters. These filters are applied when deciding whether to emit a given error message. @@ -904,10 +903,11 @@ def _SetFilters(filters): filters: A string of comma-separated filters (eg "whitespace/indent"). Each filter should start with + or -; else we die. """ - _cpplint_state.SetFilters(filters) + _cpplint_state.SetFilters(filters) + def _AddFilters(filters): - """Adds more filter overrides. + """Adds more filter overrides. Unlike _SetFilters, this function does not reset the current list of filters available. @@ -916,93 +916,97 @@ def _AddFilters(filters): filters: A string of comma-separated filters (eg "whitespace/indent"). Each filter should start with + or -; else we die. """ - _cpplint_state.AddFilters(filters) + _cpplint_state.AddFilters(filters) + def _BackupFilters(): - """ Saves the current filter list to backup storage.""" - _cpplint_state.BackupFilters() + """ Saves the current filter list to backup storage.""" + _cpplint_state.BackupFilters() + def _RestoreFilters(): - """ Restores filters previously backed up.""" - _cpplint_state.RestoreFilters() + """ Restores filters previously backed up.""" + _cpplint_state.RestoreFilters() + class _FunctionState(object): - """Tracks current function name and the number of lines in its body.""" + """Tracks current function name and the number of lines in its body.""" - _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. - _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. - def __init__(self): - self.in_a_function = False - self.lines_in_function = 0 - self.current_function = '' + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' - def Begin(self, function_name): - """Start analyzing function body. + def Begin(self, function_name): + """Start analyzing function body. Args: function_name: The name of the function being tracked. """ - self.in_a_function = True - self.lines_in_function = 0 - self.current_function = function_name + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name - def Count(self): - """Count line in current function body.""" - if self.in_a_function: - self.lines_in_function += 1 + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 - def Check(self, error, filename, linenum): - """Report if too many lines in function body. + def Check(self, error, filename, linenum): + """Report if too many lines in function body. Args: error: The function to call with any errors found. filename: The name of the current file. linenum: The number of the line to check. """ - if Match(r'T(EST|est)', self.current_function): - base_trigger = self._TEST_TRIGGER - else: - base_trigger = self._NORMAL_TRIGGER - trigger = base_trigger * 2**_VerboseLevel() - - if self.lines_in_function > trigger: - error_level = int(math.log(self.lines_in_function / base_trigger, 2)) - # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... - if error_level > 5: - error_level = 5 - error(filename, linenum, 'readability/fn_size', error_level, - 'Small and focused functions are preferred:' - ' %s has %d non-comment lines' - ' (error triggered by exceeding %d lines).' % ( - self.current_function, self.lines_in_function, trigger)) - - def End(self): - """Stop analyzing function body.""" - self.in_a_function = False + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int( + math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False class _IncludeError(Exception): - """Indicates a problem with the include order in a file.""" - pass + """Indicates a problem with the include order in a file.""" + pass class FileInfo(object): - """Provides utility functions for filenames. + """Provides utility functions for filenames. FileInfo provides easy access to the components of a file's path relative to the project root. """ - def __init__(self, filename): - self._filename = filename + def __init__(self, filename): + self._filename = filename - def FullName(self): - """Make Windows paths like Unix.""" - return os.path.abspath(self._filename).replace('\\', '/') + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') - def RepositoryName(self): - """FullName after removing the local path to the repository. + def RepositoryName(self): + """FullName after removing the local path to the repository. If we have a real absolute path name here we can try to do something smart: detecting the root of the checkout and truncating /path/to/checkout from @@ -1011,43 +1015,43 @@ class FileInfo(object): people on different computers who have checked the source out to different locations won't see bogus errors. """ - fullname = self.FullName() - - if os.path.exists(fullname): - project_dir = os.path.dirname(fullname) - - if os.path.exists(os.path.join(project_dir, ".svn")): - # If there's a .svn file in the current directory, we recursively look - # up the directory tree for the top of the SVN checkout - root_dir = project_dir - one_up_dir = os.path.dirname(root_dir) - while os.path.exists(os.path.join(one_up_dir, ".svn")): - root_dir = os.path.dirname(root_dir) - one_up_dir = os.path.dirname(one_up_dir) - - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by - # searching up from the current path. - root_dir = os.path.dirname(fullname) - while (root_dir != os.path.dirname(root_dir) and - not os.path.exists(os.path.join(root_dir, ".git")) and - not os.path.exists(os.path.join(root_dir, ".hg")) and - not os.path.exists(os.path.join(root_dir, ".svn"))): - root_dir = os.path.dirname(root_dir) - - if (os.path.exists(os.path.join(root_dir, ".git")) or - os.path.exists(os.path.join(root_dir, ".hg")) or - os.path.exists(os.path.join(root_dir, ".svn"))): - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Don't know what to do; header guard warnings may be wrong... - return fullname - - def Split(self): - """Splits the file into the directory, basename, and extension. + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. For 'chrome/browser/browser.cc', Split() would return ('chrome/browser', 'browser', '.cc') @@ -1056,57 +1060,57 @@ class FileInfo(object): A tuple of (directory, basename, extension). """ - googlename = self.RepositoryName() - project, rest = os.path.split(googlename) - return (project,) + os.path.splitext(rest) + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project, ) + os.path.splitext(rest) - def BaseName(self): - """File base name - text after the final slash, before the final period.""" - return self.Split()[1] + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] - def Extension(self): - """File extension - text following the final period.""" - return self.Split()[2] + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] - def NoExtension(self): - """File has no source file extension.""" - return '/'.join(self.Split()[0:2]) + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) - def IsSource(self): - """File has a source file extension.""" - return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') def _ShouldPrintError(category, confidence, linenum): - """If confidence >= verbose, category passes filter and is not suppressed.""" + """If confidence >= verbose, category passes filter and is not suppressed.""" - # There are three ways we might decide not to print an error message: - # a "NOLINT(category)" comment appears in the source, - # the verbosity level isn't high enough, or the filters filter it out. - if IsErrorSuppressedByNolint(category, linenum): - return False + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False - if confidence < _cpplint_state.verbose_level: - return False + if confidence < _cpplint_state.verbose_level: + return False - is_filtered = False - for one_filter in _Filters(): - if one_filter.startswith('-'): - if category.startswith(one_filter[1:]): - is_filtered = True - elif one_filter.startswith('+'): - if category.startswith(one_filter[1:]): - is_filtered = False - else: - assert False # should have been checked for in SetFilter. - if is_filtered: - return False + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False - return True + return True def Error(filename, linenum, category, confidence, message): - """Logs the fact we've found a lint error. + """Logs the fact we've found a lint error. We log where the error was found, and also our confidence in the error, that is, how certain we are this is a legitimate style regression, and @@ -1127,17 +1131,17 @@ def Error(filename, linenum, category, confidence, message): and 1 meaning that it could be a legitimate construct. message: The error message. """ - if _ShouldPrintError(category, confidence, linenum): - _cpplint_state.IncrementErrorCount(category) - if _cpplint_state.output_format == 'vs7': - sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - elif _cpplint_state.output_format == 'eclipse': - sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - else: - sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % + (filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % + (filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % + (filename, linenum, message, category, confidence)) # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. @@ -1154,14 +1158,13 @@ _RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' # if this doesn't work we try on left side but only if there's a non-character # on the right. _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + - _RE_PATTERN_C_COMMENTS + r'\s+|' + - r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + + r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + _RE_PATTERN_C_COMMENTS + + r'\s+|' + r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + _RE_PATTERN_C_COMMENTS + r')') def IsCppString(line): - """Does line terminate so, that the next symbol is in string constant. + """Does line terminate so, that the next symbol is in string constant. This function does not consider single-line nor multi-line comments. @@ -1173,12 +1176,12 @@ def IsCppString(line): string constant. """ - line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" - return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 + line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" + return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 def CleanseRawStrings(raw_lines): - """Removes C++11 raw strings from lines. + """Removes C++11 raw strings from lines. Before: static const char kData[] = R"( @@ -1197,98 +1200,100 @@ def CleanseRawStrings(raw_lines): list of lines with C++11 raw strings replaced by empty strings. """ - delimiter = None - lines_without_raw_strings = [] - for line in raw_lines: - if delimiter: - # Inside a raw string, look for the end - end = line.find(delimiter) - if end >= 0: - # Found the end of the string, match leading space for this - # line and resume copying the original lines, and also insert - # a "" on the last line. - leading_space = Match(r'^(\s*)\S', line) - line = leading_space.group(1) + '""' + line[end + len(delimiter):] - delimiter = None - else: - # Haven't found the end yet, append a blank line. - line = '""' - - # Look for beginning of a raw string, and replace them with - # empty strings. This is done in a loop to handle multiple raw - # strings on the same line. - while delimiter is None: - # Look for beginning of a raw string. - # See 2.14.15 [lex.string] for syntax. - matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if matched: - delimiter = ')' + matched.group(2) + '"' - - end = matched.group(3).find(delimiter) - if end >= 0: - # Raw string ended on same line - line = (matched.group(1) + '""' + - matched.group(3)[end + len(delimiter):]) - delimiter = None - else: - # Start of a multi-line raw string - line = matched.group(1) + '""' - else: - break - - lines_without_raw_strings.append(line) - - # TODO(unknown): if delimiter is not None here, we might want to - # emit a warning for unterminated string. - return lines_without_raw_strings + delimiter = None + lines_without_raw_strings = [] + for line in raw_lines: + if delimiter: + # Inside a raw string, look for the end + end = line.find(delimiter) + if end >= 0: + # Found the end of the string, match leading space for this + # line and resume copying the original lines, and also insert + # a "" on the last line. + leading_space = Match(r'^(\s*)\S', line) + line = leading_space.group(1) + '""' + line[end + len( + delimiter):] + delimiter = None + else: + # Haven't found the end yet, append a blank line. + line = '""' + + # Look for beginning of a raw string, and replace them with + # empty strings. This is done in a loop to handle multiple raw + # strings on the same line. + while delimiter is None: + # Look for beginning of a raw string. + # See 2.14.15 [lex.string] for syntax. + matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', + line) + if matched: + delimiter = ')' + matched.group(2) + '"' + + end = matched.group(3).find(delimiter) + if end >= 0: + # Raw string ended on same line + line = (matched.group(1) + '""' + + matched.group(3)[end + len(delimiter):]) + delimiter = None + else: + # Start of a multi-line raw string + line = matched.group(1) + '""' + else: + break + + lines_without_raw_strings.append(line) + + # TODO(unknown): if delimiter is not None here, we might want to + # emit a warning for unterminated string. + return lines_without_raw_strings def FindNextMultiLineCommentStart(lines, lineix): - """Find the beginning marker for a multiline comment.""" - while lineix < len(lines): - if lines[lineix].strip().startswith('/*'): - # Only return this marker if the comment goes beyond this line - if lines[lineix].strip().find('*/', 2) < 0: - return lineix - lineix += 1 - return len(lines) + """Find the beginning marker for a multiline comment.""" + while lineix < len(lines): + if lines[lineix].strip().startswith('/*'): + # Only return this marker if the comment goes beyond this line + if lines[lineix].strip().find('*/', 2) < 0: + return lineix + lineix += 1 + return len(lines) def FindNextMultiLineCommentEnd(lines, lineix): - """We are inside a comment, find the end marker.""" - while lineix < len(lines): - if lines[lineix].strip().endswith('*/'): - return lineix - lineix += 1 - return len(lines) + """We are inside a comment, find the end marker.""" + while lineix < len(lines): + if lines[lineix].strip().endswith('*/'): + return lineix + lineix += 1 + return len(lines) def RemoveMultiLineCommentsFromRange(lines, begin, end): - """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get - # unnecessary blank line warnings later in the code. - for i in range(begin, end): - lines[i] = '/**/' + """Clears a range of lines for multi-line comments.""" + # Having // dummy comments makes the lines non-empty, so we will not get + # unnecessary blank line warnings later in the code. + for i in range(begin, end): + lines[i] = '/**/' def RemoveMultiLineComments(filename, lines, error): - """Removes multiline (c-style) comments from lines.""" - lineix = 0 - while lineix < len(lines): - lineix_begin = FindNextMultiLineCommentStart(lines, lineix) - if lineix_begin >= len(lines): - return - lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) - if lineix_end >= len(lines): - error(filename, lineix_begin + 1, 'readability/multiline_comment', 5, - 'Could not find end of multi-line comment') - return - RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) - lineix = lineix_end + 1 + """Removes multiline (c-style) comments from lines.""" + lineix = 0 + while lineix < len(lines): + lineix_begin = FindNextMultiLineCommentStart(lines, lineix) + if lineix_begin >= len(lines): + return + lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) + if lineix_end >= len(lines): + error(filename, lineix_begin + 1, 'readability/multiline_comment', + 5, 'Could not find end of multi-line comment') + return + RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) + lineix = lineix_end + 1 def CleanseComments(line): - """Removes //-comments and single-line C-style /* */ comments. + """Removes //-comments and single-line C-style /* */ comments. Args: line: A line of C++ source. @@ -1296,15 +1301,15 @@ def CleanseComments(line): Returns: The line with single-line comments removed. """ - commentpos = line.find('//') - if commentpos != -1 and not IsCppString(line[:commentpos]): - line = line[:commentpos].rstrip() - # get rid of /* ... */ - return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) + commentpos = line.find('//') + if commentpos != -1 and not IsCppString(line[:commentpos]): + line = line[:commentpos].rstrip() + # get rid of /* ... */ + return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) class CleansedLines(object): - """Holds 4 copies of all lines with different preprocessing applied to them. + """Holds 4 copies of all lines with different preprocessing applied to them. 1) elided member contains lines without strings and comments. 2) lines member contains lines without comments. @@ -1314,25 +1319,26 @@ class CleansedLines(object): All these members are of , and of the same length. """ - def __init__(self, lines): - self.elided = [] - self.lines = [] - self.raw_lines = lines - self.num_lines = len(lines) - self.lines_without_raw_strings = CleanseRawStrings(lines) - for linenum in range(len(self.lines_without_raw_strings)): - self.lines.append(CleanseComments( - self.lines_without_raw_strings[linenum])) - elided = self._CollapseStrings(self.lines_without_raw_strings[linenum]) - self.elided.append(CleanseComments(elided)) - - def NumLines(self): - """Returns the number of lines represented.""" - return self.num_lines - - @staticmethod - def _CollapseStrings(elided): - """Collapses strings and chars on a line to simple "" or '' blocks. + def __init__(self, lines): + self.elided = [] + self.lines = [] + self.raw_lines = lines + self.num_lines = len(lines) + self.lines_without_raw_strings = CleanseRawStrings(lines) + for linenum in range(len(self.lines_without_raw_strings)): + self.lines.append( + CleanseComments(self.lines_without_raw_strings[linenum])) + elided = self._CollapseStrings(self.lines_without_raw_strings[ + linenum]) + self.elided.append(CleanseComments(elided)) + + def NumLines(self): + """Returns the number of lines represented.""" + return self.num_lines + + @staticmethod + def _CollapseStrings(elided): + """Collapses strings and chars on a line to simple "" or '' blocks. We nix strings first so we're not fooled by text like '"http://"' @@ -1342,64 +1348,65 @@ class CleansedLines(object): Returns: The line with collapsed strings. """ - if _RE_PATTERN_INCLUDE.match(elided): - return elided - - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - - # Replace quoted strings and digit separators. Both single quotes - # and double quotes are processed in the same loop, otherwise - # nested quotes wouldn't work. - collapsed = '' - while True: - # Find the first quote character - match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) - if not match: - collapsed += elided - break - head, quote, tail = match.groups() - - if quote == '"': - # Collapse double quoted strings - second_quote = tail.find('"') - if second_quote >= 0: - collapsed += head + '""' - elided = tail[second_quote + 1:] - else: - # Unmatched double quote, don't bother processing the rest - # of the line since this is probably a multiline string. - collapsed += elided - break - else: - # Found single quote, check nearby text to eliminate digit separators. - # - # There is no special handling for floating point here, because - # the integer/fractional/exponent parts would all be parsed - # correctly as long as there are digits on both sides of the - # separator. So we are fine as long as we don't see something - # like "0.'3" (gcc 4.9.0 will not allow this literal). - if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): - match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) - collapsed += head + match_literal.group(1).replace("'", '') - elided = match_literal.group(2) - else: - second_quote = tail.find('\'') - if second_quote >= 0: - collapsed += head + "''" - elided = tail[second_quote + 1:] - else: - # Unmatched single quote - collapsed += elided - break - - return collapsed + if _RE_PATTERN_INCLUDE.match(elided): + return elided + + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + + # Replace quoted strings and digit separators. Both single quotes + # and double quotes are processed in the same loop, otherwise + # nested quotes wouldn't work. + collapsed = '' + while True: + # Find the first quote character + match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) + if not match: + collapsed += elided + break + head, quote, tail = match.groups() + + if quote == '"': + # Collapse double quoted strings + second_quote = tail.find('"') + if second_quote >= 0: + collapsed += head + '""' + elided = tail[second_quote + 1:] + else: + # Unmatched double quote, don't bother processing the rest + # of the line since this is probably a multiline string. + collapsed += elided + break + else: + # Found single quote, check nearby text to eliminate digit separators. + # + # There is no special handling for floating point here, because + # the integer/fractional/exponent parts would all be parsed + # correctly as long as there are digits on both sides of the + # separator. So we are fine as long as we don't see something + # like "0.'3" (gcc 4.9.0 will not allow this literal). + if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): + match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', + "'" + tail) + collapsed += head + match_literal.group(1).replace("'", '') + elided = match_literal.group(2) + else: + second_quote = tail.find('\'') + if second_quote >= 0: + collapsed += head + "''" + elided = tail[second_quote + 1:] + else: + # Unmatched single quote + collapsed += elided + break + + return collapsed def FindEndOfExpressionInLine(line, startpos, stack): - """Find the position just after the end of current parenthesized expression. + """Find the position just after the end of current parenthesized expression. Args: line: a CleansedLines line. @@ -1411,73 +1418,73 @@ def FindEndOfExpressionInLine(line, startpos, stack): On finding an unclosed expression: (-1, None) Otherwise: (-1, new stack at end of this line) """ - for i in xrange(startpos, len(line)): - char = line[i] - if char in '([{': - # Found start of parenthesized expression, push to expression stack - stack.append(char) - elif char == '<': - # Found potential start of template argument list - if i > 0 and line[i - 1] == '<': - # Left shift operator - if stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - elif i > 0 and Search(r'\boperator\s*$', line[0:i]): - # operator<, don't add to stack - continue - else: - # Tentative start of template argument list - stack.append('<') - elif char in ')]}': - # Found end of parenthesized expression. - # - # If we are currently expecting a matching '>', the pending '<' - # must have been an operator. Remove them from expression stack. - while stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - if ((stack[-1] == '(' and char == ')') or - (stack[-1] == '[' and char == ']') or - (stack[-1] == '{' and char == '}')): - stack.pop() - if not stack: - return (i + 1, None) - else: - # Mismatched parentheses - return (-1, None) - elif char == '>': - # Found potential end of template argument list. - - # Ignore "->" and operator functions - if (i > 0 and - (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): - continue - - # Pop the stack if there is a matching '<'. Otherwise, ignore - # this '>' since it must be an operator. - if stack: - if stack[-1] == '<': - stack.pop() - if not stack: - return (i + 1, None) - elif char == ';': - # Found something that look like end of statements. If we are currently - # expecting a '>', the matching '<' must have been an operator, since - # template argument list should not contain statements. - while stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - - # Did not find end of expression or unbalanced parentheses on this line - return (-1, stack) + for i in xrange(startpos, len(line)): + char = line[i] + if char in '([{': + # Found start of parenthesized expression, push to expression stack + stack.append(char) + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + if stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + elif i > 0 and Search(r'\boperator\s*$', line[0:i]): + # operator<, don't add to stack + continue + else: + # Tentative start of template argument list + stack.append('<') + elif char in ')]}': + # Found end of parenthesized expression. + # + # If we are currently expecting a matching '>', the pending '<' + # must have been an operator. Remove them from expression stack. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + if ((stack[-1] == '(' and char == ')') or + (stack[-1] == '[' and char == ']') or + (stack[-1] == '{' and char == '}')): + stack.pop() + if not stack: + return (i + 1, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == '>': + # Found potential end of template argument list. + + # Ignore "->" and operator functions + if (i > 0 and (line[i - 1] == '-' or Search(r'\boperator\s*$', + line[0:i - 1]))): + continue + + # Pop the stack if there is a matching '<'. Otherwise, ignore + # this '>' since it must be an operator. + if stack: + if stack[-1] == '<': + stack.pop() + if not stack: + return (i + 1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '>', the matching '<' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + + # Did not find end of expression or unbalanced parentheses on this line + return (-1, stack) def CloseExpression(clean_lines, linenum, pos): - """If input points to ( or { or [ or <, finds the position that closes it. + """If input points to ( or { or [ or <, finds the position that closes it. If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the linenum/pos that correspond to the closing of the expression. @@ -1499,29 +1506,29 @@ def CloseExpression(clean_lines, linenum, pos): 'cleansed' line at linenum. """ - line = clean_lines.elided[linenum] - if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): - return (line, clean_lines.NumLines(), -1) - - # Check first line - (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) - if end_pos > -1: - return (line, linenum, end_pos) - - # Continue scanning forward - while stack and linenum < clean_lines.NumLines() - 1: - linenum += 1 line = clean_lines.elided[linenum] - (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) + if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): + return (line, clean_lines.NumLines(), -1) + + # Check first line + (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) if end_pos > -1: - return (line, linenum, end_pos) + return (line, linenum, end_pos) + + # Continue scanning forward + while stack and linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) + if end_pos > -1: + return (line, linenum, end_pos) - # Did not find end of expression before end of file, give up - return (line, clean_lines.NumLines(), -1) + # Did not find end of expression before end of file, give up + return (line, clean_lines.NumLines(), -1) def FindStartOfExpressionInLine(line, endpos, stack): - """Find position at the matching start of current expression. + """Find position at the matching start of current expression. This is almost the reverse of FindEndOfExpressionInLine, but note that the input position and returned position differs by 1. @@ -1536,69 +1543,68 @@ def FindStartOfExpressionInLine(line, endpos, stack): On finding an unclosed expression: (-1, None) Otherwise: (-1, new stack at beginning of this line) """ - i = endpos - while i >= 0: - char = line[i] - if char in ')]}': - # Found end of expression, push to expression stack - stack.append(char) - elif char == '>': - # Found potential end of template argument list. - # - # Ignore it if it's a "->" or ">=" or "operator>" - if (i > 0 and - (line[i - 1] == '-' or - Match(r'\s>=\s', line[i - 1:]) or - Search(r'\boperator\s*$', line[0:i]))): - i -= 1 - else: - stack.append('>') - elif char == '<': - # Found potential start of template argument list - if i > 0 and line[i - 1] == '<': - # Left shift operator + i = endpos + while i >= 0: + char = line[i] + if char in ')]}': + # Found end of expression, push to expression stack + stack.append(char) + elif char == '>': + # Found potential end of template argument list. + # + # Ignore it if it's a "->" or ">=" or "operator>" + if (i > 0 and + (line[i - 1] == '-' or Match(r'\s>=\s', line[i - 1:]) or + Search(r'\boperator\s*$', line[0:i]))): + i -= 1 + else: + stack.append('>') + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + i -= 1 + else: + # If there is a matching '>', we can pop the expression stack. + # Otherwise, ignore this '<' since it must be an operator. + if stack and stack[-1] == '>': + stack.pop() + if not stack: + return (i, None) + elif char in '([{': + # Found start of expression. + # + # If there are any unmatched '>' on the stack, they must be + # operators. Remove those. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + if ((char == '(' and stack[-1] == ')') or + (char == '[' and stack[-1] == ']') or + (char == '{' and stack[-1] == '}')): + stack.pop() + if not stack: + return (i, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '<', the matching '>' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + i -= 1 - else: - # If there is a matching '>', we can pop the expression stack. - # Otherwise, ignore this '<' since it must be an operator. - if stack and stack[-1] == '>': - stack.pop() - if not stack: - return (i, None) - elif char in '([{': - # Found start of expression. - # - # If there are any unmatched '>' on the stack, they must be - # operators. Remove those. - while stack and stack[-1] == '>': - stack.pop() - if not stack: - return (-1, None) - if ((char == '(' and stack[-1] == ')') or - (char == '[' and stack[-1] == ']') or - (char == '{' and stack[-1] == '}')): - stack.pop() - if not stack: - return (i, None) - else: - # Mismatched parentheses - return (-1, None) - elif char == ';': - # Found something that look like end of statements. If we are currently - # expecting a '<', the matching '>' must have been an operator, since - # template argument list should not contain statements. - while stack and stack[-1] == '>': - stack.pop() - if not stack: - return (-1, None) - - i -= 1 - - return (-1, stack) + + return (-1, stack) def ReverseCloseExpression(clean_lines, linenum, pos): - """If input points to ) or } or ] or >, finds the position that opens it. + """If input points to ) or } or ] or >, finds the position that opens it. If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the linenum/pos that correspond to the opening of the expression. @@ -1614,42 +1620,42 @@ def ReverseCloseExpression(clean_lines, linenum, pos): we ignore strings and comments when matching; and the line we return is the 'cleansed' line at linenum. """ - line = clean_lines.elided[linenum] - if line[pos] not in ')}]>': - return (line, 0, -1) - - # Check last line - (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) - if start_pos > -1: - return (line, linenum, start_pos) - - # Continue scanning backward - while stack and linenum > 0: - linenum -= 1 line = clean_lines.elided[linenum] - (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack) - if start_pos > -1: - return (line, linenum, start_pos) + if line[pos] not in ')}]>': + return (line, 0, -1) - # Did not find start of expression before beginning of file, give up - return (line, 0, -1) + # Check last line + (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) + if start_pos > -1: + return (line, linenum, start_pos) + + # Continue scanning backward + while stack and linenum > 0: + linenum -= 1 + line = clean_lines.elided[linenum] + (start_pos, stack) = FindStartOfExpressionInLine(line, + len(line) - 1, stack) + if start_pos > -1: + return (line, linenum, start_pos) + + # Did not find start of expression before beginning of file, give up + return (line, 0, -1) def CheckForCopyright(filename, lines, error): - """Logs an error if no Copyright message appears at the top of the file.""" + """Logs an error if no Copyright message appears at the top of the file.""" - # We'll say it should occur by line 10. Don't forget there's a - # dummy line at the front. - for line in xrange(1, min(len(lines), 11)): - if re.search(r'Copyright', lines[line], re.I): break - else: # means no copyright line was found - error(filename, 0, 'legal/copyright', 5, - 'No copyright message found. ' - 'You should have a line: "Copyright [year] "') + # We'll say it should occur by line 10. Don't forget there's a + # dummy line at the front. + for line in xrange(1, min(len(lines), 11)): + if re.search(r'Copyright', lines[line], re.I): break + else: # means no copyright line was found + error(filename, 0, 'legal/copyright', 5, 'No copyright message found. ' + 'You should have a line: "Copyright [year] "') def GetIndentLevel(line): - """Return the number of leading spaces in line. + """Return the number of leading spaces in line. Args: line: A string to check. @@ -1657,15 +1663,15 @@ def GetIndentLevel(line): Returns: An integer count of leading spaces, possibly zero. """ - indent = Match(r'^( *)\S', line) - if indent: - return len(indent.group(1)) - else: - return 0 + indent = Match(r'^( *)\S', line) + if indent: + return len(indent.group(1)) + else: + return 0 def GetHeaderGuardCPPVariable(filename): - """Returns the CPP variable that should be used as a header guard. + """Returns the CPP variable that should be used as a header guard. Args: filename: The name of a C++ header file. @@ -1675,12 +1681,12 @@ def GetHeaderGuardCPPVariable(filename): named file. """ - filename = os.path.basename(filename) - return re.sub(r'[^a-zA-Z0-9]', '_', filename).upper() + '_' + filename = os.path.basename(filename) + return re.sub(r'[^a-zA-Z0-9]', '_', filename).upper() + '_' def CheckForHeaderGuard(filename, clean_lines, error): - """Checks that the file contains a header guard. + """Checks that the file contains a header guard. Logs an error if no #ifndef header guard is present. For other headers, checks that the full pathname is used. @@ -1691,123 +1697,124 @@ def CheckForHeaderGuard(filename, clean_lines, error): error: The function to call with any errors found. """ - # Don't check for header guards if there are error suppression - # comments somewhere in this file. - # - # Because this is silencing a warning for a nonexistent line, we - # only support the very specific NOLINT(build/header_guard) syntax, - # and not the general NOLINT or NOLINT(*) syntax. - raw_lines = clean_lines.lines_without_raw_strings - for i in raw_lines: - if Search(r'//\s*NOLINT\(build/header_guard\)', i): - return - - cppvar = GetHeaderGuardCPPVariable(filename) - - ifndef = '' - ifndef_linenum = 0 - define = '' - endif = '' - endif_linenum = 0 - pragma_linenum = -1 - for linenum, line in enumerate(raw_lines): - linesplit = line.split() - if len(linesplit) >= 2: - if linesplit[0] == '#pragma' and linesplit[1] == 'once': - pragma_linenum = linenum - # find the first occurrence of #ifndef and #define, save arg - if not ifndef and linesplit[0] == '#ifndef': - # set ifndef to the header guard presented on the #ifndef line. - ifndef = linesplit[1] - ifndef_linenum = linenum - if not define and linesplit[0] == '#define': - define = linesplit[1] - # find the last occurrence of #endif, save entire line - if line.startswith('#endif'): - endif = line - endif_linenum = linenum - if pragma_linenum != -1: - return # short path for pragma once - if not ifndef or not define or ifndef != define: - error(filename, 0, 'build/header_guard', 5, - 'No #ifndef header guard found, suggested CPP variable is: %s' % - cppvar) - return - - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ - # for backward compatibility. - if ifndef != cppvar: - error_level = 0 - if ifndef != cppvar + '_': - error_level = 5 - - ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, + # Don't check for header guards if there are error suppression + # comments somewhere in this file. + # + # Because this is silencing a warning for a nonexistent line, we + # only support the very specific NOLINT(build/header_guard) syntax, + # and not the general NOLINT or NOLINT(*) syntax. + raw_lines = clean_lines.lines_without_raw_strings + for i in raw_lines: + if Search(r'//\s*NOLINT\(build/header_guard\)', i): + return + + cppvar = GetHeaderGuardCPPVariable(filename) + + ifndef = '' + ifndef_linenum = 0 + define = '' + endif = '' + endif_linenum = 0 + pragma_linenum = -1 + for linenum, line in enumerate(raw_lines): + linesplit = line.split() + if len(linesplit) >= 2: + if linesplit[0] == '#pragma' and linesplit[1] == 'once': + pragma_linenum = linenum + # find the first occurrence of #ifndef and #define, save arg + if not ifndef and linesplit[0] == '#ifndef': + # set ifndef to the header guard presented on the #ifndef line. + ifndef = linesplit[1] + ifndef_linenum = linenum + if not define and linesplit[0] == '#define': + define = linesplit[1] + # find the last occurrence of #endif, save entire line + if line.startswith('#endif'): + endif = line + endif_linenum = linenum + if pragma_linenum != -1: + return # short path for pragma once + if not ifndef or not define or ifndef != define: + error(filename, 0, 'build/header_guard', 5, + 'No #ifndef header guard found, suggested CPP variable is: %s' % + cppvar) + return + + # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ + # for backward compatibility. + if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], + ifndef_linenum, error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + # Check for "//" comments on endif line. + ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, error) - error(filename, ifndef_linenum, 'build/header_guard', error_level, - '#ifndef header guard has wrong style, please use: %s' % cppvar) - - # Check for "//" comments on endif line. - ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, - error) - match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) - if match: - if match.group(1) == '_': - # Issue low severity warning for deprecated double trailing underscore - error(filename, endif_linenum, 'build/header_guard', 0, - '#endif line should be "#endif // %s"' % cppvar) - return - - # Didn't find the corresponding "//" comment. If this file does not - # contain any "//" comments at all, it could be that the compiler - # only wants "/**/" comments, look for those instead. - no_single_line_comments = True - for i in xrange(1, len(raw_lines) - 1): - line = raw_lines[i] - if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): - no_single_line_comments = False - break - - if no_single_line_comments: - match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) if match: - if match.group(1) == '_': - # Low severity warning for double trailing underscore - error(filename, endif_linenum, 'build/header_guard', 0, - '#endif line should be "#endif /* %s */"' % cppvar) - return + if match.group(1) == '_': + # Issue low severity warning for deprecated double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif // %s"' % cppvar) + return - # Didn't find anything - error(filename, endif_linenum, 'build/header_guard', 5, - '#endif line should be "#endif // %s"' % cppvar) + # Didn't find the corresponding "//" comment. If this file does not + # contain any "//" comments at all, it could be that the compiler + # only wants "/**/" comments, look for those instead. + no_single_line_comments = True + for i in xrange(1, len(raw_lines) - 1): + line = raw_lines[i] + if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', + line): + no_single_line_comments = False + break + + if no_single_line_comments: + match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + if match: + if match.group(1) == '_': + # Low severity warning for double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif /* %s */"' % cppvar) + return + + # Didn't find anything + error(filename, endif_linenum, 'build/header_guard', 5, + '#endif line should be "#endif // %s"' % cppvar) def CheckHeaderFileIncluded(filename, include_state, error): - """Logs an error if a .cc file does not include its header.""" - - # Do not check test files - if filename.endswith('_test.cc') or filename.endswith('_unittest.cc'): - return - - fileinfo = FileInfo(filename) - headerfile = filename[0:len(filename) - 2] + 'h' - if not os.path.exists(headerfile): - return - headername = FileInfo(headerfile).RepositoryName() - first_include = 0 - for section_list in include_state.include_list: - for f in section_list: - if headername in f[0] or f[0] in headername: + """Logs an error if a .cc file does not include its header.""" + + # Do not check test files + if filename.endswith('_test.cc') or filename.endswith('_unittest.cc'): + return + + fileinfo = FileInfo(filename) + headerfile = filename[0:len(filename) - 2] + 'h' + if not os.path.exists(headerfile): return - if not first_include: - first_include = f[1] + headername = FileInfo(headerfile).RepositoryName() + first_include = 0 + for section_list in include_state.include_list: + for f in section_list: + if headername in f[0] or f[0] in headername: + return + if not first_include: + first_include = f[1] - error(filename, first_include, 'build/include', 5, - '%s should include its header file %s' % (fileinfo.RepositoryName(), - headername)) + error(filename, first_include, 'build/include', 5, + '%s should include its header file %s' % (fileinfo.RepositoryName(), + headername)) def CheckForBadCharacters(filename, lines, error): - """Logs an error for each line containing bad characters. + """Logs an error for each line containing bad characters. Two kinds of bad characters: @@ -1823,16 +1830,19 @@ def CheckForBadCharacters(filename, lines, error): lines: An array of strings, each representing a line of the file. error: The function to call with any errors found. """ - for linenum, line in enumerate(lines): - if u'\ufffd' in line: - error(filename, linenum, 'readability/utf8', 5, - 'Line contains invalid UTF-8 (or Unicode replacement character).') - if '\0' in line: - error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error( + filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).' + ) + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, + 'Line contains NUL byte.') def CheckForNewlineAtEOF(filename, lines, error): - """Logs an error if there is no newline char at the end of the file. + """Logs an error if there is no newline char at the end of the file. Args: filename: The name of the current file. @@ -1840,17 +1850,18 @@ def CheckForNewlineAtEOF(filename, lines, error): error: The function to call with any errors found. """ - # The array lines() was created by adding two newlines to the - # original file (go figure), then splitting on \n. - # To verify that the file ends in \n, we just have to make sure the - # last-but-two element of lines() exists and is empty. - if len(lines) < 3 or lines[-2]: - error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, - 'Could not find a newline character at the end of the file.') + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, + len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): - """Logs an error if we see /* ... */ or "..." that extend past one line. + """Logs an error if we see /* ... */ or "..." that extend past one line. /* ... */ comments are legit inside macros, for one line. Otherwise, we prefer // comments, so it's ok to warn about the @@ -1866,25 +1877,25 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] + line = clean_lines.elided[linenum] - # Remove all \\ (escaped backslashes) from the line. They are OK, and the - # second (escaped) slash may trigger later \" detection erroneously. - line = line.replace('\\\\', '') + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. + line = line.replace('\\\\', '') - if line.count('/*') > line.count('*/'): - error(filename, linenum, 'readability/multiline_comment', 5, - 'Complex multi-line /*...*/-style comment found. ' - 'Lint may give bogus warnings. ' - 'Consider replacing these with //-style comments, ' - 'with #if 0...#endif, ' - 'or with more clearly structured multi-line comments.') + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') - if (line.count('"') - line.count('\\"')) % 2: - error(filename, linenum, 'readability/multiline_string', 5, - 'Multi-line string ("...") found. This lint script doesn\'t ' - 'do well with such strings, and may give bogus warnings. ' - 'Use C++11 raw strings or concatenation instead.') + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. ' + 'Use C++11 raw strings or concatenation instead.') # (non-threadsafe name, thread-safe alternative, validation pattern) @@ -1911,14 +1922,12 @@ _THREADING_LIST = ( ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'), ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'), ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'), - ('strtok(', 'strtok_r(', - _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), - ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), - ) + ('strtok(', 'strtok_r(', _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), + ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), ) def CheckPosixThreading(filename, clean_lines, linenum, error): - """Checks for calls to thread-unsafe functions. + """Checks for calls to thread-unsafe functions. Much code has been originally written without consideration of multi-threading. Also, engineers are relying on their old experience; @@ -1932,19 +1941,18 @@ def CheckPosixThreading(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: - # Additional pattern matching check to confirm that this is the - # function we are looking for - if Search(pattern, line): - error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_func + - '...) instead of ' + single_thread_func + - '...) for improved thread safety.') + line = clean_lines.elided[linenum] + for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: + # Additional pattern matching check to confirm that this is the + # function we are looking for + if Search(pattern, line): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_func + '...) instead of ' + + single_thread_func + '...) for improved thread safety.') def CheckVlogArguments(filename, clean_lines, linenum, error): - """Checks that VLOG() is only used for defining a logging level. + """Checks that VLOG() is only used for defining a logging level. For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and VLOG(FATAL) are not. @@ -1955,20 +1963,20 @@ def CheckVlogArguments(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): - error(filename, linenum, 'runtime/vlog', 5, - 'VLOG() should be used with numeric verbosity level. ' - 'Use LOG() if you want symbolic severity levels.') + line = clean_lines.elided[linenum] + if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): + error(filename, linenum, 'runtime/vlog', 5, + 'VLOG() should be used with numeric verbosity level. ' + 'Use LOG() if you want symbolic severity levels.') + # Matches invalid increment: *count++, which moves pointer instead of # incrementing a value. -_RE_PATTERN_INVALID_INCREMENT = re.compile( - r'^\s*\*\w+(\+\+|--);') +_RE_PATTERN_INVALID_INCREMENT = re.compile(r'^\s*\*\w+(\+\+|--);') def CheckInvalidIncrement(filename, clean_lines, linenum, error): - """Checks for invalid increment *count++. + """Checks for invalid increment *count++. For example following function: void increment_counter(int* count) { @@ -1983,37 +1991,38 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - if _RE_PATTERN_INVALID_INCREMENT.match(line): - error(filename, linenum, 'runtime/invalid_increment', 5, - 'Changing pointer instead of value (or unused value of operator*).') + line = clean_lines.elided[linenum] + if _RE_PATTERN_INVALID_INCREMENT.match(line): + error( + filename, linenum, 'runtime/invalid_increment', 5, + 'Changing pointer instead of value (or unused value of operator*).') def IsMacroDefinition(clean_lines, linenum): - if Search(r'^#define', clean_lines[linenum]): - return True + if Search(r'^#define', clean_lines[linenum]): + return True - if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): - return True + if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): + return True - return False + return False def IsForwardClassDeclaration(clean_lines, linenum): - return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) + return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) class _BlockInfo(object): - """Stores information about a generic block of code.""" + """Stores information about a generic block of code.""" - def __init__(self, seen_open_brace): - self.seen_open_brace = seen_open_brace - self.open_parentheses = 0 - self.inline_asm = _NO_ASM - self.check_namespace_indentation = False + def __init__(self, seen_open_brace): + self.seen_open_brace = seen_open_brace + self.open_parentheses = 0 + self.inline_asm = _NO_ASM + self.check_namespace_indentation = False - def CheckBegin(self, filename, clean_lines, linenum, error): - """Run checks that applies to text up to the opening brace. + def CheckBegin(self, filename, clean_lines, linenum, error): + """Run checks that applies to text up to the opening brace. This is mostly for checking the text after the class identifier and the "{", usually where the base class is specified. For other @@ -2025,10 +2034,10 @@ class _BlockInfo(object): linenum: The number of the line to check. error: The function to call with any errors found. """ - pass + pass - def CheckEnd(self, filename, clean_lines, linenum, error): - """Run checks that applies to text after the closing brace. + def CheckEnd(self, filename, clean_lines, linenum, error): + """Run checks that applies to text after the closing brace. This is mostly used for checking end of namespace comments. @@ -2038,10 +2047,10 @@ class _BlockInfo(object): linenum: The number of the line to check. error: The function to call with any errors found. """ - pass + pass - def IsBlockInfo(self): - """Returns true if this block is a _BlockInfo. + def IsBlockInfo(self): + """Returns true if this block is a _BlockInfo. This is convenient for verifying that an object is an instance of a _BlockInfo, but not an instance of any of the derived classes. @@ -2049,231 +2058,235 @@ class _BlockInfo(object): Returns: True for this class, False for derived classes. """ - return self.__class__ == _BlockInfo + return self.__class__ == _BlockInfo class _ExternCInfo(_BlockInfo): - """Stores information about an 'extern "C"' block.""" + """Stores information about an 'extern "C"' block.""" - def __init__(self): - _BlockInfo.__init__(self, True) + def __init__(self): + _BlockInfo.__init__(self, True) class _ClassInfo(_BlockInfo): - """Stores information about a class.""" - - def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, False) - self.name = name - self.starting_linenum = linenum - self.is_derived = False - self.check_namespace_indentation = True - if class_or_struct == 'struct': - self.access = 'public' - self.is_struct = True - else: - self.access = 'private' - self.is_struct = False + """Stores information about a class.""" + + def __init__(self, name, class_or_struct, clean_lines, linenum): + _BlockInfo.__init__(self, False) + self.name = name + self.starting_linenum = linenum + self.is_derived = False + self.check_namespace_indentation = True + if class_or_struct == 'struct': + self.access = 'public' + self.is_struct = True + else: + self.access = 'private' + self.is_struct = False - # Remember initial indentation level for this class. Using raw_lines here - # instead of elided to account for leading comments. - self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) + # Remember initial indentation level for this class. Using raw_lines here + # instead of elided to account for leading comments. + self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) - # Try to find the end of the class. This will be confused by things like: - # class A { - # } *x = { ... - # - # But it's still good enough for CheckSectionSpacing. - self.last_line = 0 - depth = 0 - for i in range(linenum, clean_lines.NumLines()): - line = clean_lines.elided[i] - depth += line.count('{') - line.count('}') - if not depth: - self.last_line = i - break - - def CheckBegin(self, filename, clean_lines, linenum, error): - # Look for a bare ':' - if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): - self.is_derived = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - # If there is a DISALLOW macro, it should appear near the end of - # the class. - seen_last_thing_in_class = False - for i in xrange(linenum - 1, self.starting_linenum, -1): - match = Search( - r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + - self.name + r'\)', - clean_lines.elided[i]) - if match: - if seen_last_thing_in_class: - error(filename, i, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - break - - if not Match(r'^\s*$', clean_lines.elided[i]): - seen_last_thing_in_class = True - - # Check that closing brace is aligned with beginning of the class. - # Only do this if the closing brace is indented by only whitespaces. - # This means we will not check single-line class definitions. - indent = Match(r'^( *)\}', clean_lines.elided[linenum]) - if indent and len(indent.group(1)) != self.class_indent: - if self.is_struct: - parent = 'struct ' + self.name - else: - parent = 'class ' + self.name - error(filename, linenum, 'whitespace/indent', 3, - 'Closing brace should be aligned with beginning of %s' % parent) + # Try to find the end of the class. This will be confused by things like: + # class A { + # } *x = { ... + # + # But it's still good enough for CheckSectionSpacing. + self.last_line = 0 + depth = 0 + for i in range(linenum, clean_lines.NumLines()): + line = clean_lines.elided[i] + depth += line.count('{') - line.count('}') + if not depth: + self.last_line = i + break + + def CheckBegin(self, filename, clean_lines, linenum, error): + # Look for a bare ':' + if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): + self.is_derived = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + # If there is a DISALLOW macro, it should appear near the end of + # the class. + seen_last_thing_in_class = False + for i in xrange(linenum - 1, self.starting_linenum, -1): + match = Search( + r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + + self.name + r'\)', clean_lines.elided[i]) + if match: + if seen_last_thing_in_class: + error(filename, i, 'readability/constructors', 3, + match.group(1) + + ' should be the last thing in the class') + break + + if not Match(r'^\s*$', clean_lines.elided[i]): + seen_last_thing_in_class = True + + # Check that closing brace is aligned with beginning of the class. + # Only do this if the closing brace is indented by only whitespaces. + # This means we will not check single-line class definitions. + indent = Match(r'^( *)\}', clean_lines.elided[linenum]) + if indent and len(indent.group(1)) != self.class_indent: + if self.is_struct: + parent = 'struct ' + self.name + else: + parent = 'class ' + self.name + error(filename, linenum, 'whitespace/indent', 3, + 'Closing brace should be aligned with beginning of %s' % + parent) class _NamespaceInfo(_BlockInfo): - """Stores information about a namespace.""" - - def __init__(self, name, linenum): - _BlockInfo.__init__(self, False) - self.name = name or '' - self.starting_linenum = linenum - self.check_namespace_indentation = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - """Check end of namespace comments.""" - line = clean_lines.raw_lines[linenum] - - # Check how many lines is enclosed in this namespace. Don't issue - # warning for missing namespace comments if there aren't enough - # lines. However, do apply checks if there is already an end of - # namespace comment and it's incorrect. - # - # TODO(unknown): We always want to check end of namespace comments - # if a namespace is large, but sometimes we also want to apply the - # check if a short namespace contained nontrivial things (something - # other than forward declarations). There is currently no logic on - # deciding what these nontrivial things are, so this check is - # triggered by namespace size only, which works most of the time. - if (linenum - self.starting_linenum < 10 - and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): - return - - # Look for matching comment at end of namespace. - # - # Note that we accept C style "/* */" comments for terminating - # namespaces, so that code that terminate namespaces inside - # preprocessor macros can be cpplint clean. - # - # We also accept stuff like "// end of namespace ." with the - # period at the end. - # - # Besides these, we don't accept anything else, otherwise we might - # get false negatives when existing comment is a substring of the - # expected namespace. - if self.name: - # Named namespace - if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + - r'[\*/\.\\\s]*$'), - line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace %s"' % - self.name) - else: - # Anonymous namespace - if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - # If "// namespace anonymous" or "// anonymous namespace (more text)", - # mention "// anonymous namespace" as an acceptable form - if Match(r'}.*\b(namespace anonymous|anonymous namespace)\b', line): - error(filename, linenum, 'readability/namespace', 5, - 'Anonymous namespace should be terminated with "// namespace"' - ' or "// anonymous namespace"') + """Stores information about a namespace.""" + + def __init__(self, name, linenum): + _BlockInfo.__init__(self, False) + self.name = name or '' + self.starting_linenum = linenum + self.check_namespace_indentation = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Check end of namespace comments.""" + line = clean_lines.raw_lines[linenum] + + # Check how many lines is enclosed in this namespace. Don't issue + # warning for missing namespace comments if there aren't enough + # lines. However, do apply checks if there is already an end of + # namespace comment and it's incorrect. + # + # TODO(unknown): We always want to check end of namespace comments + # if a namespace is large, but sometimes we also want to apply the + # check if a short namespace contained nontrivial things (something + # other than forward declarations). There is currently no logic on + # deciding what these nontrivial things are, so this check is + # triggered by namespace size only, which works most of the time. + if (linenum - self.starting_linenum < 10 and + not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + return + + # Look for matching comment at end of namespace. + # + # Note that we accept C style "/* */" comments for terminating + # namespaces, so that code that terminate namespaces inside + # preprocessor macros can be cpplint clean. + # + # We also accept stuff like "// end of namespace ." with the + # period at the end. + # + # Besides these, we don't accept anything else, otherwise we might + # get false negatives when existing comment is a substring of the + # expected namespace. + if self.name: + # Named namespace + if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + + re.escape(self.name) + r'[\*/\.\\\s]*$'), line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace %s"' % + self.name) else: - error(filename, linenum, 'readability/namespace', 5, - 'Anonymous namespace should be terminated with "// namespace"') + # Anonymous namespace + if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + # If "// namespace anonymous" or "// anonymous namespace (more text)", + # mention "// anonymous namespace" as an acceptable form + if Match(r'}.*\b(namespace anonymous|anonymous namespace)\b', + line): + error( + filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ' or "// anonymous namespace"') + else: + error( + filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ) class _PreprocessorInfo(object): - """Stores checkpoints of nesting stacks when #if/#else is seen.""" + """Stores checkpoints of nesting stacks when #if/#else is seen.""" - def __init__(self, stack_before_if): - # The entire nesting stack before #if - self.stack_before_if = stack_before_if + def __init__(self, stack_before_if): + # The entire nesting stack before #if + self.stack_before_if = stack_before_if - # The entire nesting stack up to #else - self.stack_before_else = [] + # The entire nesting stack up to #else + self.stack_before_else = [] - # Whether we have already seen #else or #elif - self.seen_else = False + # Whether we have already seen #else or #elif + self.seen_else = False class NestingState(object): - """Holds states related to parsing braces.""" - - def __init__(self): - # Stack for tracking all braces. An object is pushed whenever we - # see a "{", and popped when we see a "}". Only 3 types of - # objects are possible: - # - _ClassInfo: a class or struct. - # - _NamespaceInfo: a namespace. - # - _BlockInfo: some other type of block. - self.stack = [] - - # Top of the previous stack before each Update(). - # - # Because the nesting_stack is updated at the end of each line, we - # had to do some convoluted checks to find out what is the current - # scope at the beginning of the line. This check is simplified by - # saving the previous top of nesting stack. - # - # We could save the full stack, but we only need the top. Copying - # the full nesting stack would slow down cpplint by ~10%. - self.previous_stack_top = [] + """Holds states related to parsing braces.""" + + def __init__(self): + # Stack for tracking all braces. An object is pushed whenever we + # see a "{", and popped when we see a "}". Only 3 types of + # objects are possible: + # - _ClassInfo: a class or struct. + # - _NamespaceInfo: a namespace. + # - _BlockInfo: some other type of block. + self.stack = [] + + # Top of the previous stack before each Update(). + # + # Because the nesting_stack is updated at the end of each line, we + # had to do some convoluted checks to find out what is the current + # scope at the beginning of the line. This check is simplified by + # saving the previous top of nesting stack. + # + # We could save the full stack, but we only need the top. Copying + # the full nesting stack would slow down cpplint by ~10%. + self.previous_stack_top = [] - # Stack of _PreprocessorInfo objects. - self.pp_stack = [] + # Stack of _PreprocessorInfo objects. + self.pp_stack = [] - def SeenOpenBrace(self): - """Check if we have seen the opening brace for the innermost block. + def SeenOpenBrace(self): + """Check if we have seen the opening brace for the innermost block. Returns: True if we have seen the opening brace, False if the innermost block is still expecting an opening brace. """ - return (not self.stack) or self.stack[-1].seen_open_brace + return (not self.stack) or self.stack[-1].seen_open_brace - def InNamespaceBody(self): - """Check if we are currently one level inside a namespace body. + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. Returns: True if top of the stack is a namespace block, False otherwise. """ - return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) - def InExternC(self): - """Check if we are currently one level inside an 'extern "C"' block. + def InExternC(self): + """Check if we are currently one level inside an 'extern "C"' block. Returns: True if top of the stack is an extern block, False otherwise. """ - return self.stack and isinstance(self.stack[-1], _ExternCInfo) + return self.stack and isinstance(self.stack[-1], _ExternCInfo) - def InClassDeclaration(self): - """Check if we are currently one level inside a class or struct declaration. + def InClassDeclaration(self): + """Check if we are currently one level inside a class or struct declaration. Returns: True if top of the stack is a class/struct, False otherwise. """ - return self.stack and isinstance(self.stack[-1], _ClassInfo) + return self.stack and isinstance(self.stack[-1], _ClassInfo) - def InAsmBlock(self): - """Check if we are currently one level inside an inline ASM block. + def InAsmBlock(self): + """Check if we are currently one level inside an inline ASM block. Returns: True if the top of the stack is a block containing inline ASM. """ - return self.stack and self.stack[-1].inline_asm != _NO_ASM + return self.stack and self.stack[-1].inline_asm != _NO_ASM - def InTemplateArgumentList(self, clean_lines, linenum, pos): - """Check if current position is inside template argument list. + def InTemplateArgumentList(self, clean_lines, linenum, pos): + """Check if current position is inside template argument list. Args: clean_lines: A CleansedLines instance containing the file. @@ -2282,50 +2295,51 @@ class NestingState(object): Returns: True if (linenum, pos) is inside template arguments. """ - while linenum < clean_lines.NumLines(): - # Find the earliest character that might indicate a template argument - line = clean_lines.elided[linenum] - match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) - if not match: - linenum += 1 - pos = 0 - continue - token = match.group(1) - pos += len(match.group(0)) - - # These things do not look like template argument list: - # class Suspect { - # class Suspect x; } - if token in ('{', '}', ';'): return False - - # These things look like template argument list: - # template - # template - # template - # template - if token in ('>', '=', '[', ']', '.'): return True - - # Check if token is an unmatched '<'. - # If not, move on to the next character. - if token != '<': - pos += 1 - if pos >= len(line): - linenum += 1 - pos = 0 - continue - - # We can't be sure if we just find a single '<', and need to - # find the matching '>'. - (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1) - if end_pos < 0: - # Not sure if template argument list or syntax error in file + while linenum < clean_lines.NumLines(): + # Find the earliest character that might indicate a template argument + line = clean_lines.elided[linenum] + match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) + if not match: + linenum += 1 + pos = 0 + continue + token = match.group(1) + pos += len(match.group(0)) + + # These things do not look like template argument list: + # class Suspect { + # class Suspect x; } + if token in ('{', '}', ';'): return False + + # These things look like template argument list: + # template + # template + # template + # template + if token in ('>', '=', '[', ']', '.'): return True + + # Check if token is an unmatched '<'. + # If not, move on to the next character. + if token != '<': + pos += 1 + if pos >= len(line): + linenum += 1 + pos = 0 + continue + + # We can't be sure if we just find a single '<', and need to + # find the matching '>'. + (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, + pos - 1) + if end_pos < 0: + # Not sure if template argument list or syntax error in file + return False + linenum = end_line + pos = end_pos return False - linenum = end_line - pos = end_pos - return False - def UpdatePreprocessor(self, line): - """Update preprocessor stack. + def UpdatePreprocessor(self, line): + """Update preprocessor stack. We need to handle preprocessors due to classes like this: #ifdef SWIG @@ -2345,44 +2359,45 @@ class NestingState(object): Args: line: current line to check. """ - if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): - # Beginning of #if block, save the nesting stack here. The saved - # stack will allow us to restore the parsing state in the #else case. - self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) - elif Match(r'^\s*#\s*(else|elif)\b', line): - # Beginning of #else block - if self.pp_stack: - if not self.pp_stack[-1].seen_else: - # This is the first #else or #elif block. Remember the - # whole nesting stack up to this point. This is what we - # keep after the #endif. - self.pp_stack[-1].seen_else = True - self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) - - # Restore the stack to how it was before the #if - self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) - else: - # TODO(unknown): unexpected #else, issue warning? - pass - elif Match(r'^\s*#\s*endif\b', line): - # End of #if or #else blocks. - if self.pp_stack: - # If we saw an #else, we will need to restore the nesting - # stack to its former state before the #else, otherwise we - # will just continue from where we left off. - if self.pp_stack[-1].seen_else: - # Here we can just use a shallow copy since we are the last - # reference to it. - self.stack = self.pp_stack[-1].stack_before_else - # Drop the corresponding #if - self.pp_stack.pop() - else: - # TODO(unknown): unexpected #endif, issue warning? - pass - - # TODO(unknown): Update() is too long, but we will refactor later. - def Update(self, filename, clean_lines, linenum, error): - """Update nesting state with current line. + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. + self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy( + self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. + self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + # TODO(unknown): Update() is too long, but we will refactor later. + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. Args: filename: The name of the current file. @@ -2390,198 +2405,201 @@ class NestingState(object): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] + line = clean_lines.elided[linenum] - # Remember top of the previous nesting stack. - # - # The stack is always pushed/popped and not modified in place, so - # we can just do a shallow copy instead of copy.deepcopy. Using - # deepcopy would slow down cpplint by ~28%. - if self.stack: - self.previous_stack_top = self.stack[-1] - else: - self.previous_stack_top = None - - # Update pp_stack - self.UpdatePreprocessor(line) - - # Count parentheses. This is to avoid adding struct arguments to - # the nesting stack. - if self.stack: - inner_block = self.stack[-1] - depth_change = line.count('(') - line.count(')') - inner_block.open_parentheses += depth_change - - # Also check if we are starting or ending an inline assembly block. - if inner_block.inline_asm in (_NO_ASM, _END_ASM): - if (depth_change != 0 and - inner_block.open_parentheses == 1 and - _MATCH_ASM.match(line)): - # Enter assembly block - inner_block.inline_asm = _INSIDE_ASM - else: - # Not entering assembly block. If previous line was _END_ASM, - # we will now shift to _NO_ASM state. - inner_block.inline_asm = _NO_ASM - elif (inner_block.inline_asm == _INSIDE_ASM and - inner_block.open_parentheses == 0): - # Exit assembly block - inner_block.inline_asm = _END_ASM - - # Consume namespace declaration at the beginning of the line. Do - # this in a loop so that we catch same line declarations like this: - # namespace proto2 { namespace bridge { class MessageSet; } } - while True: - # Match start of namespace. The "\b\s*" below catches namespace - # declarations even if it weren't followed by a whitespace, this - # is so that we don't confuse our namespace checker. The - # missing spaces will be flagged by CheckSpacing. - namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) - if not namespace_decl_match: - break - - new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) - self.stack.append(new_namespace) - - line = namespace_decl_match.group(2) - if line.find('{') != -1: - new_namespace.seen_open_brace = True - line = line[line.find('{') + 1:] - - # Look for a class declaration in whatever is left of the line - # after parsing namespaces. The regexp accounts for decorated classes - # such as in: - # class LOCKABLE API Object { - # }; - class_decl_match = Match( - r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' - r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' - r'(.*)$', line) - if (class_decl_match and - (not self.stack or self.stack[-1].open_parentheses == 0)): - # We do not want to accept classes that are actually template arguments: - # template , - # template class Ignore3> - # void Function() {}; - # - # To avoid template argument cases, we scan forward and look for - # an unmatched '>'. If we see one, assume we are inside a - # template argument list. - end_declaration = len(class_decl_match.group(1)) - if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): - self.stack.append(_ClassInfo( - class_decl_match.group(3), class_decl_match.group(2), - clean_lines, linenum)) - line = class_decl_match.group(4) - - # If we have not yet seen the opening brace for the innermost block, - # run checks here. - if not self.SeenOpenBrace(): - self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) - - # Update access control if we are inside a class/struct - if self.stack and isinstance(self.stack[-1], _ClassInfo): - classinfo = self.stack[-1] - access_match = Match( - r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' - r':(?:[^:]|$)', - line) - if access_match: - classinfo.access = access_match.group(2) - - # Check that access keywords are indented +1 space. Skip this - # check if the keywords are not preceded by whitespaces. - indent = access_match.group(1) - if (len(indent) != classinfo.class_indent + 1 and - Match(r'^\s*$', indent)): - if classinfo.is_struct: - parent = 'struct ' + classinfo.name - else: - parent = 'class ' + classinfo.name - slots = '' - if access_match.group(3): - slots = access_match.group(3) - error(filename, linenum, 'whitespace/indent', 3, - '%s%s: should be indented +1 space inside %s' % ( - access_match.group(2), slots, parent)) - - # Consume braces or semicolons from what's left of the line - while True: - # Match first brace, semicolon, or closed parenthesis. - matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) - if not matched: - break - - token = matched.group(1) - if token == '{': - # If namespace or class hasn't seen a opening brace yet, mark - # namespace/class head as complete. Push a new block onto the - # stack otherwise. - if not self.SeenOpenBrace(): - self.stack[-1].seen_open_brace = True - elif Match(r'^extern\s*"[^"]*"\s*\{', line): - self.stack.append(_ExternCInfo()) - else: - self.stack.append(_BlockInfo(True)) - if _MATCH_ASM.match(line): - self.stack[-1].inline_asm = _BLOCK_ASM - - elif token == ';' or token == ')': - # If we haven't seen an opening brace yet, but we already saw - # a semicolon, this is probably a forward declaration. Pop - # the stack for these. + # Remember top of the previous nesting stack. # - # Similarly, if we haven't seen an opening brace yet, but we - # already saw a closing parenthesis, then these are probably - # function arguments with extra "class" or "struct" keywords. - # Also pop these stack for these. - if not self.SeenOpenBrace(): - self.stack.pop() - else: # token == '}' - # Perform end of block checks and pop the stack. + # The stack is always pushed/popped and not modified in place, so + # we can just do a shallow copy instead of copy.deepcopy. Using + # deepcopy would slow down cpplint by ~28%. if self.stack: - self.stack[-1].CheckEnd(filename, clean_lines, linenum, error) - self.stack.pop() - line = matched.group(2) + self.previous_stack_top = self.stack[-1] + else: + self.previous_stack_top = None + + # Update pp_stack + self.UpdatePreprocessor(line) - def InnermostClass(self): - """Get class info on the top of the stack. + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. If previous line was _END_ASM, + # we will now shift to _NO_ASM state. + inner_block.inline_asm = _NO_ASM + elif (inner_block.inline_asm == _INSIDE_ASM and + inner_block.open_parentheses == 0): + # Exit assembly block + inner_block.inline_asm = _END_ASM + + # Consume namespace declaration at the beginning of the line. Do + # this in a loop so that we catch same line declarations like this: + # namespace proto2 { namespace bridge { class MessageSet; } } + while True: + # Match start of namespace. The "\b\s*" below catches namespace + # declarations even if it weren't followed by a whitespace, this + # is so that we don't confuse our namespace checker. The + # missing spaces will be flagged by CheckSpacing. + namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', + line) + if not namespace_decl_match: + break + + new_namespace = _NamespaceInfo( + namespace_decl_match.group(1), linenum) + self.stack.append(new_namespace) + + line = namespace_decl_match.group(2) + if line.find('{') != -1: + new_namespace.seen_open_brace = True + line = line[line.find('{') + 1:] + + # Look for a class declaration in whatever is left of the line + # after parsing namespaces. The regexp accounts for decorated classes + # such as in: + # class LOCKABLE API Object { + # }; + class_decl_match = Match( + r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' + r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' + r'(.*)$', line) + if (class_decl_match and + (not self.stack or self.stack[-1].open_parentheses == 0)): + # We do not want to accept classes that are actually template arguments: + # template , + # template class Ignore3> + # void Function() {}; + # + # To avoid template argument cases, we scan forward and look for + # an unmatched '>'. If we see one, assume we are inside a + # template argument list. + end_declaration = len(class_decl_match.group(1)) + if not self.InTemplateArgumentList(clean_lines, linenum, + end_declaration): + self.stack.append( + _ClassInfo( + class_decl_match.group(3), + class_decl_match.group(2), clean_lines, linenum)) + line = class_decl_match.group(4) + + # If we have not yet seen the opening brace for the innermost block, + # run checks here. + if not self.SeenOpenBrace(): + self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) + + # Update access control if we are inside a class/struct + if self.stack and isinstance(self.stack[-1], _ClassInfo): + classinfo = self.stack[-1] + access_match = Match( + r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' + r':(?:[^:]|$)', line) + if access_match: + classinfo.access = access_match.group(2) + + # Check that access keywords are indented +1 space. Skip this + # check if the keywords are not preceded by whitespaces. + indent = access_match.group(1) + if (len(indent) != classinfo.class_indent + 1 and + Match(r'^\s*$', indent)): + if classinfo.is_struct: + parent = 'struct ' + classinfo.name + else: + parent = 'class ' + classinfo.name + slots = '' + if access_match.group(3): + slots = access_match.group(3) + error(filename, linenum, 'whitespace/indent', 3, + '%s%s: should be indented +1 space inside %s' % ( + access_match.group(2), slots, parent)) + + # Consume braces or semicolons from what's left of the line + while True: + # Match first brace, semicolon, or closed parenthesis. + matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) + if not matched: + break + + token = matched.group(1) + if token == '{': + # If namespace or class hasn't seen a opening brace yet, mark + # namespace/class head as complete. Push a new block onto the + # stack otherwise. + if not self.SeenOpenBrace(): + self.stack[-1].seen_open_brace = True + elif Match(r'^extern\s*"[^"]*"\s*\{', line): + self.stack.append(_ExternCInfo()) + else: + self.stack.append(_BlockInfo(True)) + if _MATCH_ASM.match(line): + self.stack[-1].inline_asm = _BLOCK_ASM + + elif token == ';' or token == ')': + # If we haven't seen an opening brace yet, but we already saw + # a semicolon, this is probably a forward declaration. Pop + # the stack for these. + # + # Similarly, if we haven't seen an opening brace yet, but we + # already saw a closing parenthesis, then these are probably + # function arguments with extra "class" or "struct" keywords. + # Also pop these stack for these. + if not self.SeenOpenBrace(): + self.stack.pop() + else: # token == '}' + # Perform end of block checks and pop the stack. + if self.stack: + self.stack[-1].CheckEnd(filename, clean_lines, linenum, + error) + self.stack.pop() + line = matched.group(2) + + def InnermostClass(self): + """Get class info on the top of the stack. Returns: A _ClassInfo object if we are inside a class, or None otherwise. """ - for i in range(len(self.stack), 0, -1): - classinfo = self.stack[i - 1] - if isinstance(classinfo, _ClassInfo): - return classinfo - return None + for i in range(len(self.stack), 0, -1): + classinfo = self.stack[i - 1] + if isinstance(classinfo, _ClassInfo): + return classinfo + return None - def CheckCompletedBlocks(self, filename, error): - """Checks that all classes and namespaces have been completely parsed. + def CheckCompletedBlocks(self, filename, error): + """Checks that all classes and namespaces have been completely parsed. Call this when all lines in a file have been processed. Args: filename: The name of the current file. error: The function to call with any errors found. """ - # Note: This test can result in false positives if #ifdef constructs - # get in the way of brace matching. See the testBuildClass test in - # cpplint_unittest.py for an example of this. - for obj in self.stack: - if isinstance(obj, _ClassInfo): - error(filename, obj.starting_linenum, 'build/class', 5, - 'Failed to find complete declaration of class %s' % - obj.name) - elif isinstance(obj, _NamespaceInfo): - error(filename, obj.starting_linenum, 'build/namespaces', 5, - 'Failed to find complete declaration of namespace %s' % - obj.name) - - -def CheckForNonStandardConstructs(filename, clean_lines, linenum, - nesting_state, error): - r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. + # Note: This test can result in false positives if #ifdef constructs + # get in the way of brace matching. See the testBuildClass test in + # cpplint_unittest.py for an example of this. + for obj in self.stack: + if isinstance(obj, _ClassInfo): + error(filename, obj.starting_linenum, 'build/class', 5, + 'Failed to find complete declaration of class %s' % + obj.name) + elif isinstance(obj, _NamespaceInfo): + error(filename, obj.starting_linenum, 'build/namespaces', 5, + 'Failed to find complete declaration of namespace %s' % + obj.name) + + +def CheckForNonStandardConstructs(filename, clean_lines, linenum, nesting_state, + error): + r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. Complain about several constructs which gcc-2 accepts, but which are not standard C++. Warning about these in lint is one way to ease the @@ -2608,143 +2626,144 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, filename, line number, error level, and message """ - # Remove comments from the line, but leave in strings for now. - line = clean_lines.lines[linenum] - - if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line): - error(filename, linenum, 'runtime/printf_format', 3, - '%q in format strings is deprecated. Use %ll instead.') - - if Search(r'printf\s*\(.*".*%\d+\$', line): - error(filename, linenum, 'runtime/printf_format', 2, - '%N$ formats are unconventional. Try rewriting to avoid them.') - - # Remove escaped backslashes before looking for undefined escapes. - line = line.replace('\\\\', '') - - if Search(r'("|\').*\\(%|\[|\(|{)', line): - error(filename, linenum, 'build/printf_format', 3, - '%, [, (, and { are undefined character escapes. Unescape them.') - - # For the rest, work with both comments and strings removed. - line = clean_lines.elided[linenum] - - if Search(r'\b(const|volatile|void|char|short|int|long' - r'|float|double|signed|unsigned' - r'|schar|u?int8|u?int16|u?int32|u?int64)' - r'\s+(register|static|extern|typedef)\b', - line): - error(filename, linenum, 'build/storage_class', 5, - 'Storage class (static, extern, typedef, etc) should be first.') - - if Match(r'\s*#\s*endif\s*[^/\s]+', line): - error(filename, linenum, 'build/endif_comment', 5, - 'Uncommented text after #endif is non-standard. Use a comment.') - - if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line): - error(filename, linenum, 'build/forward_decl', 5, - 'Inner-style forward declarations are invalid. Remove this line.') - - if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', - line): - error(filename, linenum, 'build/deprecated', 3, - '>? and ))?' - # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' - error(filename, linenum, 'runtime/member_string_references', 2, - 'const string& members are dangerous. It is much better to use ' - 'alternatives, such as pointers or simple constants.') - - # Everything else in this function operates on class declarations. - # Return early if the top of the nesting stack is not a class, or if - # the class head is not completed yet. - classinfo = nesting_state.InnermostClass() - if not classinfo or not classinfo.seen_open_brace: - return - - # The class may have been declared with namespace or classname qualifiers. - # The constructor and destructor will not have those qualifiers. - base_classname = classinfo.name.split('::')[-1] - - # Look for single-argument constructors that aren't marked explicit. - # Technically a valid construct, but against style. Also look for - # non-single-argument constructors which are also technically valid, but - # strongly suggest something is wrong. - explicit_constructor_match = Match( - r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*' - r'\(((?:[^()]|\([^()]*\))*)\)' - % re.escape(base_classname), - line) - - if explicit_constructor_match: - is_marked_explicit = explicit_constructor_match.group(1) - - if not explicit_constructor_match.group(2): - constructor_args = [] - else: - constructor_args = explicit_constructor_match.group(2).split(',') - - # collapse arguments so that commas in template parameter lists and function - # argument parameter lists don't split arguments in two - i = 0 - while i < len(constructor_args): - constructor_arg = constructor_args[i] - while (constructor_arg.count('<') > constructor_arg.count('>') or - constructor_arg.count('(') > constructor_arg.count(')')): - constructor_arg += ',' + constructor_args[i + 1] - del constructor_args[i + 1] - constructor_args[i] = constructor_arg - i += 1 - - defaulted_args = [arg for arg in constructor_args if '=' in arg] - noarg_constructor = (not constructor_args or # empty arg list - # 'void' arg specifier - (len(constructor_args) == 1 and - constructor_args[0].strip() == 'void')) - onearg_constructor = ((len(constructor_args) == 1 and # exactly one arg - not noarg_constructor) or - # all but at most one arg defaulted - (len(constructor_args) >= 1 and - not noarg_constructor and - len(defaulted_args) >= len(constructor_args) - 1)) - initializer_list_constructor = bool( - onearg_constructor and - Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) - copy_constructor = bool( - onearg_constructor and - Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), constructor_args[0].strip())) - - if (not is_marked_explicit and - onearg_constructor and - not initializer_list_constructor and - not copy_constructor): - if defaulted_args: - error(filename, linenum, 'runtime/explicit', 5, - 'Constructors callable with one argument ' - 'should be marked explicit.') - else: - error(filename, linenum, 'runtime/explicit', 5, - 'Single-parameter constructors should be marked explicit.') - elif is_marked_explicit and not onearg_constructor: - if noarg_constructor: - error(filename, linenum, 'runtime/explicit', 5, - 'Zero-parameter constructors should not be marked explicit.') - else: - error(filename, linenum, 'runtime/explicit', 0, - 'Constructors that require multiple arguments ' - 'should not be marked explicit.') + # Remove comments from the line, but leave in strings for now. + line = clean_lines.lines[linenum] + + if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line): + error(filename, linenum, 'runtime/printf_format', 3, + '%q in format strings is deprecated. Use %ll instead.') + + if Search(r'printf\s*\(.*".*%\d+\$', line): + error(filename, linenum, 'runtime/printf_format', 2, + '%N$ formats are unconventional. Try rewriting to avoid them.') + + # Remove escaped backslashes before looking for undefined escapes. + line = line.replace('\\\\', '') + + if Search(r'("|\').*\\(%|\[|\(|{)', line): + error(filename, linenum, 'build/printf_format', 3, + '%, [, (, and { are undefined character escapes. Unescape them.') + + # For the rest, work with both comments and strings removed. + line = clean_lines.elided[linenum] + + if Search(r'\b(const|volatile|void|char|short|int|long' + r'|float|double|signed|unsigned' + r'|schar|u?int8|u?int16|u?int32|u?int64)' + r'\s+(register|static|extern|typedef)\b', line): + error(filename, linenum, 'build/storage_class', 5, + 'Storage class (static, extern, typedef, etc) should be first.') + + if Match(r'\s*#\s*endif\s*[^/\s]+', line): + error(filename, linenum, 'build/endif_comment', 5, + 'Uncommented text after #endif is non-standard. Use a comment.') + + if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line): + error( + filename, linenum, 'build/forward_decl', 5, + 'Inner-style forward declarations are invalid. Remove this line.') + + if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', + line): + error( + filename, linenum, 'build/deprecated', 3, + '>? and ))?' + # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' + error(filename, linenum, 'runtime/member_string_references', 2, + 'const string& members are dangerous. It is much better to use ' + 'alternatives, such as pointers or simple constants.') + + # Everything else in this function operates on class declarations. + # Return early if the top of the nesting stack is not a class, or if + # the class head is not completed yet. + classinfo = nesting_state.InnermostClass() + if not classinfo or not classinfo.seen_open_brace: + return + + # The class may have been declared with namespace or classname qualifiers. + # The constructor and destructor will not have those qualifiers. + base_classname = classinfo.name.split('::')[-1] + + # Look for single-argument constructors that aren't marked explicit. + # Technically a valid construct, but against style. Also look for + # non-single-argument constructors which are also technically valid, but + # strongly suggest something is wrong. + explicit_constructor_match = Match( + r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*' + r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base_classname), line) + + if explicit_constructor_match: + is_marked_explicit = explicit_constructor_match.group(1) + + if not explicit_constructor_match.group(2): + constructor_args = [] + else: + constructor_args = explicit_constructor_match.group(2).split(',') + + # collapse arguments so that commas in template parameter lists and function + # argument parameter lists don't split arguments in two + i = 0 + while i < len(constructor_args): + constructor_arg = constructor_args[i] + while (constructor_arg.count('<') > constructor_arg.count('>') or + constructor_arg.count('(') > constructor_arg.count(')')): + constructor_arg += ',' + constructor_args[i + 1] + del constructor_args[i + 1] + constructor_args[i] = constructor_arg + i += 1 + + defaulted_args = [arg for arg in constructor_args if '=' in arg] + noarg_constructor = ( + not constructor_args or # empty arg list + # 'void' arg specifier + (len(constructor_args) == 1 and + constructor_args[0].strip() == 'void')) + onearg_constructor = ( + ( + len(constructor_args) == 1 and # exactly one arg + not noarg_constructor) or + # all but at most one arg defaulted + (len(constructor_args) >= 1 and not noarg_constructor and + len(defaulted_args) >= len(constructor_args) - 1)) + initializer_list_constructor = bool( + onearg_constructor and + Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) + copy_constructor = bool( + onearg_constructor and + Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' % + re.escape(base_classname), constructor_args[0].strip())) + + if (not is_marked_explicit and onearg_constructor and + not initializer_list_constructor and not copy_constructor): + if defaulted_args: + error(filename, linenum, 'runtime/explicit', 5, + 'Constructors callable with one argument ' + 'should be marked explicit.') + else: + error( + filename, linenum, 'runtime/explicit', 5, + 'Single-parameter constructors should be marked explicit.') + elif is_marked_explicit and not onearg_constructor: + if noarg_constructor: + error( + filename, linenum, 'runtime/explicit', 5, + 'Zero-parameter constructors should not be marked explicit.') + else: + error(filename, linenum, 'runtime/explicit', 0, + 'Constructors that require multiple arguments ' + 'should not be marked explicit.') def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): - """Checks for the correctness of various spacing around function calls. + """Checks for the correctness of various spacing around function calls. Args: filename: The name of the current file. @@ -2752,75 +2771,74 @@ def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Since function calls often occur inside if/for/while/switch - # expressions - which have their own, more liberal conventions - we - # first see if we should be looking inside such an expression for a - # function call, to which we can apply more strict standards. - fncall = line # if there's no control flow construct, look at whole line - for pattern in (r'\bif\s*\((.*)\)\s*{', - r'\bfor\s*\((.*)\)\s*{', - r'\bwhile\s*\((.*)\)\s*[{;]', - r'\bswitch\s*\((.*)\)\s*{'): - match = Search(pattern, line) - if match: - fncall = match.group(1) # look inside the parens for function calls - break - - # Except in if/for/while/switch, there should never be space - # immediately inside parens (eg "f( 3, 4 )"). We make an exception - # for nested parens ( (a+b) + c ). Likewise, there should never be - # a space before a ( when it's a function argument. I assume it's a - # function argument when the char before the whitespace is legal in - # a function name (alnum + _) and we're not starting a macro. Also ignore - # pointers and references to arrays and functions coz they're too tricky: - # we use a very simple way to recognize these: - # " (something)(maybe-something)" or - # " (something)(maybe-something," or - # " (something)[something]" - # Note that we assume the contents of [] to be short enough that - # they'll never need to wrap. - if ( # Ignore control structures. - not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', - fncall) and - # Ignore pointers/references to functions. - not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and - # Ignore pointers/references to arrays. - not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): - if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space after ( in function call') - elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space after (') - if (Search(r'\w\s+\(', fncall) and - not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and - not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and - not Search(r'\bcase\s+\(', fncall)): - # TODO(unknown): Space after an operator function seem to be a common - # error, silence those for now by restricting them to highest verbosity. - if Search(r'\boperator_*\b', line): - error(filename, linenum, 'whitespace/parens', 0, - 'Extra space before ( in function call') - else: - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space before ( in function call') - # If the ) is followed only by a newline or a { + newline, assume it's - # part of a control statement (if/while/etc), and don't complain - if Search(r'[^)]\s+\)\s*[^{\s]', fncall): - # If the closing parenthesis is preceded by only whitespaces, - # try to give a more descriptive error message. - if Search(r'^\s+\)', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Closing ) should be moved to the previous line') - else: - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space before )') + line = clean_lines.elided[linenum] + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. + fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search( + r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. + not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and + not Search(r'\bcase\s+\(', fncall)): + # TODO(unknown): Space after an operator function seem to be a common + # error, silence those for now by restricting them to highest verbosity. + if Search(r'\boperator_*\b', line): + error(filename, linenum, 'whitespace/parens', 0, + 'Extra space before ( in function call') + else: + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') def IsBlankLine(line): - """Returns true if the given line is blank. + """Returns true if the given line is blank. We consider a line to be blank if the line is empty or consists of only white spaces. @@ -2831,26 +2849,26 @@ def IsBlankLine(line): Returns: True, if the given line is blank. """ - return not line or line.isspace() + return not line or line.isspace() def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, error): - is_namespace_indent_item = ( - len(nesting_state.stack) > 1 and - nesting_state.stack[-1].check_namespace_indentation and - isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and - nesting_state.previous_stack_top == nesting_state.stack[-2]) + is_namespace_indent_item = ( + len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and + nesting_state.previous_stack_top == nesting_state.stack[-2]) - if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, - clean_lines.elided, line): - CheckItemIndentationInNamespace(filename, clean_lines.elided, - line, error) + if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + clean_lines.elided, line): + CheckItemIndentationInNamespace(filename, clean_lines.elided, line, + error) -def CheckForFunctionLengths(filename, clean_lines, linenum, - function_state, error): - """Reports for long function bodies. +def CheckForFunctionLengths(filename, clean_lines, linenum, function_state, + error): + """Reports for long function bodies. For an overview why this is done, see: http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions @@ -2871,56 +2889,57 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, function_state: Current function name and lines in body so far. error: The function to call with any errors found. """ - lines = clean_lines.lines - line = lines[linenum] - joined_line = '' - - starting_func = False - regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... - match_result = Match(regexp, line) - if match_result: - # If the name is all caps and underscores, figure it's a macro and - # ignore it, unless it's TEST or TEST_F. - function_name = match_result.group(1).split()[-1] - if function_name == 'TEST' or function_name == 'TEST_F' or ( - not Match(r'[A-Z_]+$', function_name)): - starting_func = True - - if starting_func: - body_found = False - for start_linenum in xrange(linenum, clean_lines.NumLines()): - start_line = lines[start_linenum] - joined_line += ' ' + start_line.lstrip() - if Search(r'(;|})', start_line): # Declarations and trivial functions - body_found = True - break # ... ignore - elif Search(r'{', start_line): - body_found = True - function = Search(r'((\w|:)*)\(', line).group(1) - if Match(r'TEST', function): # Handle TEST... macros - parameter_regexp = Search(r'(\(.*\))', joined_line) - if parameter_regexp: # Ignore bad syntax - function += parameter_regexp.group(1) - else: - function += '()' - function_state.Begin(function) - break - if not body_found: - # No body for the function (or evidence of a non-function) was found. - error(filename, linenum, 'readability/fn_size', 5, - 'Lint failed to find start of function body.') - elif Match(r'^\}\s*$', line): # function end - function_state.Check(error, filename, linenum) - function_state.End() - elif not Match(r'^\s*$', line): - function_state.Count() # Count non-blank/non-comment lines. + lines = clean_lines.lines + line = lines[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. + function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', + start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') def CheckComment(line, filename, linenum, next_line_start, error): - """Checks for common mistakes in comments. + """Checks for common mistakes in comments. Args: line: The line in question. @@ -2929,54 +2948,54 @@ def CheckComment(line, filename, linenum, next_line_start, error): next_line_start: The first non-whitespace column of the next line. error: The function to call with any errors found. """ - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. If so, ignore it - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if (line.count('"', 0, commentpos) - - line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes - # Allow one space for new scopes, two spaces otherwise: - if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - - # Checks for common mistakes in TODO comments. - comment = line[commentpos:] - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') - - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') - - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') - - # If the comment contains an alphanumeric character, there - # should be a space somewhere between it and the // unless - # it's a /// or //! Doxygen comment. - if (Match(r'//[^ ]*\w', comment) and - not Match(r'(///|//\!)(\s+|$)', comment)): - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - line.count('\\"', 0, commentpos) + ) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) + and ((commentpos >= 1 and + line[commentpos - 1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos - 2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + + # Checks for common mistakes in TODO comments. + comment = line[commentpos:] + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + + # If the comment contains an alphanumeric character, there + # should be a space somewhere between it and the // unless + # it's a /// or //! Doxygen comment. + if (Match(r'//[^ ]*\w', comment) and + not Match(r'(///|//\!)(\s+|$)', comment)): + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. + """Checks for improper use of DISALLOW* macros. Args: filename: The name of the current file. @@ -2986,27 +3005,27 @@ def CheckAccess(filename, clean_lines, linenum, nesting_state, error): the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' - r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) - if not matched: - return - if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): - if nesting_state.stack[-1].access != 'private': - error(filename, linenum, 'readability/constructors', 3, - '%s must be in the private: section' % matched.group(1)) - - else: - # Found DISALLOW* macro outside a class declaration, or perhaps it - # was used inside a function when it should have been part of the - # class declaration. We could issue a warning here, but it - # probably resulted in a compiler error already. - pass + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): - """Checks for the correctness of various spacing issues in the code. + """Checks for the correctness of various spacing issues in the code. Things we check for: spaces around operators, spaces after if/for/while/switch, no spaces around parens in function calls, two @@ -3023,118 +3042,114 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error: The function to call with any errors found. """ - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw = clean_lines.lines_without_raw_strings - line = raw[linenum] - - # Before nixing comments, check if the line is blank for no good - # reason. This includes the first line after a block is opened, and - # blank lines at the end of a function (ie, right before a line like '}' - # - # Skip all the blank line checks if we are immediately inside a - # namespace body. In other words, don't issue blank line warnings - # for this block: - # namespace { - # - # } - # - # A warning about missing end of namespace comments will be issued instead. - # - # Also skip blank line checks for 'extern "C"' blocks, which are formatted - # like namespaces. - if (IsBlankLine(line) and - not nesting_state.InNamespaceBody() and - not nesting_state.InExternC()): - elided = clean_lines.elided - prev_line = elided[linenum - 1] - prevbrace = prev_line.rfind('{') - # TODO(unknown): Don't complain if line before blank line, and line after, - # both start with alnums and are indented the same amount. - # This ignores whitespace at the start of a namespace block - # because those are not usually indented. - if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: - # OK, we have a blank line at the start of a code block. Before we - # complain, we check if it is an exception to the rule: The previous - # non-empty line has the parameters of a function header that are indented - # 4 spaces (because they did not fit in a 80 column line when placed on - # the same line as the function name). We also check for the case where - # the previous line is indented 6 spaces, which may happen when the - # initializers of a constructor do not fit into a 80 column line. - exception = False - if Match(r' {6}\w', prev_line): # Initializer list? - # We are looking for the opening column of initializer list, which - # should be indented 4 spaces to cause 6 space indentation afterwards. - search_position = linenum-2 - while (search_position >= 0 - and Match(r' {6}\w', elided[search_position])): - search_position -= 1 - exception = (search_position >= 0 - and elided[search_position][:5] == ' :') - else: - # Search for the function arguments or an initializer list. We use a - # simple heuristic here: If the line is indented 4 spaces; and we have a - # closing paren, without the opening paren, followed by an opening brace - # or colon (for initializer lists) we assume that it is the last line of - # a function header. If we have a colon indented 4 spaces, it is an - # initializer list. - exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', - prev_line) - or Match(r' {4}:', prev_line)) - - if not exception: - error(filename, linenum, 'whitespace/blank_line', 2, - 'Redundant blank line at the start of a code block ' - 'should be deleted.') - # Ignore blank lines at the end of a block in a long if-else - # chain, like this: - # if (condition1) { - # // Something followed by a blank line + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw = clean_lines.lines_without_raw_strings + line = raw[linenum] + + # Before nixing comments, check if the line is blank for no good + # reason. This includes the first line after a block is opened, and + # blank lines at the end of a function (ie, right before a line like '}' + # + # Skip all the blank line checks if we are immediately inside a + # namespace body. In other words, don't issue blank line warnings + # for this block: + # namespace { # - # } else if (condition2) { - # // Something else # } + # + # A warning about missing end of namespace comments will be issued instead. + # + # Also skip blank line checks for 'extern "C"' blocks, which are formatted + # like namespaces. + if (IsBlankLine(line) and not nesting_state.InNamespaceBody() and + not nesting_state.InExternC()): + elided = clean_lines.elided + prev_line = elided[linenum - 1] + prevbrace = prev_line.rfind('{') + # TODO(unknown): Don't complain if line before blank line, and line after, + # both start with alnums and are indented the same amount. + # This ignores whitespace at the start of a namespace block + # because those are not usually indented. + if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: + # OK, we have a blank line at the start of a code block. Before we + # complain, we check if it is an exception to the rule: The previous + # non-empty line has the parameters of a function header that are indented + # 4 spaces (because they did not fit in a 80 column line when placed on + # the same line as the function name). We also check for the case where + # the previous line is indented 6 spaces, which may happen when the + # initializers of a constructor do not fit into a 80 column line. + exception = False + if Match(r' {6}\w', prev_line): # Initializer list? + # We are looking for the opening column of initializer list, which + # should be indented 4 spaces to cause 6 space indentation afterwards. + search_position = linenum - 2 + while (search_position >= 0 and + Match(r' {6}\w', elided[search_position])): + search_position -= 1 + exception = (search_position >= 0 and + elided[search_position][:5] == ' :') + else: + # Search for the function arguments or an initializer list. We use a + # simple heuristic here: If the line is indented 4 spaces; and we have a + # closing paren, without the opening paren, followed by an opening brace + # or colon (for initializer lists) we assume that it is the last line of + # a function header. If we have a colon indented 4 spaces, it is an + # initializer list. + exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line and Match(r'\s*}', next_line) and + next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, check comments + next_line_start = 0 if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - if (next_line - and Match(r'\s*}', next_line) - and next_line.find('} else ') == -1): - error(filename, linenum, 'whitespace/blank_line', 3, - 'Redundant blank line at the end of a code block ' - 'should be deleted.') - - matched = Match(r'\s*(public|protected|private):', prev_line) - if matched: - error(filename, linenum, 'whitespace/blank_line', 3, - 'Do not leave a blank line after "%s:"' % matched.group(1)) - - # Next, check comments - next_line_start = 0 - if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - next_line_start = len(next_line) - len(next_line.lstrip()) - CheckComment(line, filename, linenum, next_line_start, error) + next_line = raw[linenum + 1] + next_line_start = len(next_line) - len(next_line.lstrip()) + CheckComment(line, filename, linenum, next_line_start, error) - # get rid of comments and strings - line = clean_lines.elided[linenum] + # get rid of comments and strings + line = clean_lines.elided[linenum] - # You shouldn't have spaces before your brackets, except maybe after - # 'delete []' or 'return []() {};' - if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Extra space before [') + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'return []() {};' + if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, 'Extra space before [') - # In range-based for, we wanted spaces before and after the colon, but - # not around "::" tokens that might appear. - if (Search(r'for *\(.*[^:]:[^: ]', line) or - Search(r'for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search(r'for *\(.*[^:]:[^: ]', line) or + Search(r'for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') def CheckOperatorSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing around operators. + """Checks for horizontal spacing around operators. Args: filename: The name of the current file. @@ -3142,114 +3157,116 @@ def CheckOperatorSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Don't try to do spacing checks for operator methods. Do this by - # replacing the troublesome characters with something else, - # preserving column position for all other characters. - # - # The replacement is done repeatedly to avoid false positives from - # operators that call operators. - while True: - match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) - if match: - line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) - else: - break - - # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". - # Otherwise not. Note we only check for non-spaces on *both* sides; - # sometimes people put non-spaces on one side when aligning ='s among - # many lines (not that this is behavior that I approve of...) - if ((Search(r'[\w.]=', line) or - Search(r'=[\w.]', line)) - and not Search(r'\b(if|while|for) ', line) - # Operators taken from [lex.operators] in C++11 standard. - and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) - and not Search(r'operator=', line)): - error(filename, linenum, 'whitespace/operators', 4, - 'Missing spaces around =') - - # It's ok not to have spaces around binary operators like + - * /, but if - # there's too little whitespace, we get concerned. It's hard to tell, - # though, so we punt on this one for now. TODO. - - # You should always have whitespace around binary operators. - # - # Check <= and >= first to avoid false positives with < and >, then - # check non-include lines for spacing around < and >. - # - # If the operator is followed by a comma, assume it's be used in a - # macro context and don't do any checks. This avoids false - # positives. - # - # Note that && is not included here. Those are checked separately - # in CheckRValueReference - match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around %s' % match.group(1)) - elif not Match(r'#.*include', line): - # Look for < that is not surrounded by spaces. This is only - # triggered if both sides are missing spaces, even though - # technically should should flag if at least one side is missing a - # space. This is done to avoid some false positives with shifts. - match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + line = clean_lines.elided[linenum] + + # Don't try to do spacing checks for operator methods. Do this by + # replacing the troublesome characters with something else, + # preserving column position for all other characters. + # + # The replacement is done repeatedly to avoid false positives from + # operators that call operators. + while True: + match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) + if match: + line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) + else: + break + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if ((Search(r'[\w.]=', line) or + Search(r'=[\w.]', line)) and not Search(r'\b(if|while|for) ', line) + # Operators taken from [lex.operators] in C++11 standard. + and + not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) and + not Search(r'operator=', line)): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. + + # You should always have whitespace around binary operators. + # + # Check <= and >= first to avoid false positives with < and >, then + # check non-include lines for spacing around < and >. + # + # If the operator is followed by a comma, assume it's be used in a + # macro context and don't do any checks. This avoids false + # positives. + # + # Note that && is not included here. Those are checked separately + # in CheckRValueReference + match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) if match: - (_, _, end_pos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if end_pos <= -1: error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') + 'Missing spaces around %s' % match.group(1)) + elif not Match(r'#.*include', line): + # Look for < that is not surrounded by spaces. This is only + # triggered if both sides are missing spaces, even though + # technically should should flag if at least one side is missing a + # space. This is done to avoid some false positives with shifts. + match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + if match: + (_, _, end_pos) = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if end_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') + + # Look for > that is not surrounded by spaces. Similar to the + # above, we only trigger if both sides are missing spaces to avoid + # false positives with shifts. + match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + if match: + (_, _, start_pos) = ReverseCloseExpression(clean_lines, linenum, + len(match.group(1))) + if start_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # + # We also allow operators following an opening parenthesis, since + # those tend to be macros that deal with operators. + match = Search(r'(operator|[^\s(<])(?:L|UL|ULL|l|ul|ull)?<<([^\s,=<])', + line) + if (match and + not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') - # Look for > that is not surrounded by spaces. Similar to the - # above, we only trigger if both sides are missing spaces to avoid - # false positives with shifts. - match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + # We allow no-spaces around >> for almost anything. This is because + # C++11 allows ">>" to close nested templates, which accounts for + # most cases when ">>" is not followed by a space. + # + # We still warn on ">>" followed by alpha character, because that is + # likely due to ">>" being used for right shifts, e.g.: + # value >> alpha + # + # When ">>" is used to close templates, the alphanumeric letter that + # follows would be part of an identifier, and there should still be + # a space separating the template type and the identifier. + # type> alpha + match = Search(r'>>[a-zA-Z_]', line) if match: - (_, _, start_pos) = ReverseCloseExpression( - clean_lines, linenum, len(match.group(1))) - if start_pos <= -1: error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') - - # We allow no-spaces around << when used like this: 10<<20, but - # not otherwise (particularly, not when used as streams) - # - # We also allow operators following an opening parenthesis, since - # those tend to be macros that deal with operators. - match = Search(r'(operator|[^\s(<])(?:L|UL|ULL|l|ul|ull)?<<([^\s,=<])', line) - if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and - not (match.group(1) == 'operator' and match.group(2) == ';')): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <<') - - # We allow no-spaces around >> for almost anything. This is because - # C++11 allows ">>" to close nested templates, which accounts for - # most cases when ">>" is not followed by a space. - # - # We still warn on ">>" followed by alpha character, because that is - # likely due to ">>" being used for right shifts, e.g.: - # value >> alpha - # - # When ">>" is used to close templates, the alphanumeric letter that - # follows would be part of an identifier, and there should still be - # a space separating the template type and the identifier. - # type> alpha - match = Search(r'>>[a-zA-Z_]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >>') - - # There shouldn't be space around unary operators - match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) - if match: - error(filename, linenum, 'whitespace/operators', 4, - 'Extra space for operator %s' % match.group(1)) + 'Missing spaces around >>') + + # There shouldn't be space around unary operators + match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) + if match: + error(filename, linenum, 'whitespace/operators', 4, + 'Extra space for operator %s' % match.group(1)) def CheckParenthesisSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing around parentheses. + """Checks for horizontal spacing around parentheses. Args: filename: The name of the current file. @@ -3257,37 +3274,36 @@ def CheckParenthesisSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # No spaces after an if, while, switch, or for - match = Search(r' (if\(|for\(|while\(|switch\()', line) - if match: - error(filename, linenum, 'whitespace/parens', 5, - 'Missing space before ( in %s' % match.group(1)) - - # For if/for/while/switch, the left and right parens should be - # consistent about how many spaces are inside the parens, and - # there should either be zero or one spaces inside the parens. - # We don't want: "if ( foo)" or "if ( foo )". - # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. - match = Search(r'\b(if|for|while|switch)\s*' - r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', - line) - if match: - if len(match.group(2)) != len(match.group(4)): - if not (match.group(3) == ';' and - len(match.group(2)) == 1 + len(match.group(4)) or - not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + line = clean_lines.elided[linenum] + + # No spaces after an if, while, switch, or for + match = Search(r' (if\(|for\(|while\(|switch\()', line) + if match: error(filename, linenum, 'whitespace/parens', 5, - 'Mismatching spaces inside () in %s' % match.group(1)) - if len(match.group(2)) not in [0, 1]: - error(filename, linenum, 'whitespace/parens', 5, - 'Should have zero or one spaces inside ( and ) in %s' % - match.group(1)) + 'Missing space before ( in %s' % match.group(1)) + + # For if/for/while/switch, the left and right parens should be + # consistent about how many spaces are inside the parens, and + # there should either be zero or one spaces inside the parens. + # We don't want: "if ( foo)" or "if ( foo )". + # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. + match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) def CheckCommaSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing near commas and semicolons. + """Checks for horizontal spacing near commas and semicolons. Args: filename: The name of the current file. @@ -3295,35 +3311,34 @@ def CheckCommaSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - raw = clean_lines.lines_without_raw_strings - line = clean_lines.elided[linenum] - - # You should always have a space after a comma (either as fn arg or operator) - # - # This does not apply when the non-space character following the - # comma is another comma, since the only time when that happens is - # for empty macro arguments. - # - # We run this check in two passes: first pass on elided lines to - # verify that lines contain missing whitespaces, second pass on raw - # lines to confirm that those missing whitespaces are not due to - # elided comments. - if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and - Search(r',[^,\s]', raw[linenum])): - error(filename, linenum, 'whitespace/comma', 3, - 'Missing space after ,') - - # You should always have a space after a semicolon - # except for few corner cases - # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more - # space after ; - if Search(r';[^\s};\\)/]', line): - error(filename, linenum, 'whitespace/semicolon', 3, - 'Missing space after ;') + raw = clean_lines.lines_without_raw_strings + line = clean_lines.elided[linenum] + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. + # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and + Search(r',[^,\s]', raw[linenum])): + error(filename, linenum, 'whitespace/comma', 3, 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') def CheckBracesSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing near commas. + """Checks for horizontal spacing near commas. Args: filename: The name of the current file. @@ -3331,78 +3346,78 @@ def CheckBracesSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Except after an opening paren, or after another opening brace (in case of - # an initializer list, for instance), you should have spaces before your - # braces. And since you should never have braces at the beginning of a line, - # this is an easy test. - match = Match(r'^(.*[^ ({>]){', line) - if match: - # Try a bit harder to check for brace initialization. This - # happens in one of the following forms: - # Constructor() : initializer_list_{} { ... } - # Constructor{}.MemberFunction() - # Type variable{}; - # FunctionCall(type{}, ...); - # LastArgument(..., type{}); - # LOG(INFO) << type{} << " ..."; - # map_of_type[{...}] = ...; - # ternary = expr ? new type{} : nullptr; - # OuterTemplate{}> - # - # We check for the character following the closing brace, and - # silence the warning if it's one of those listed above, i.e. - # "{.;,)<>]:". - # - # To account for nested initializer list, we allow any number of - # closing braces up to "{;,)<". We can't simply silence the - # warning on first sight of closing brace, because that would - # cause false negatives for things that are not initializer lists. - # Silence this: But not this: - # Outer{ if (...) { - # Inner{...} if (...){ // Missing space before { - # }; } - # - # There is a false negative with this approach if people inserted - # spurious semicolons, e.g. "if (cond){};", but we will catch the - # spurious semicolon with a separate check. - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - trailing_text = '' - if endpos > -1: - trailing_text = endline[endpos:] - for offset in xrange(endlinenum + 1, - min(endlinenum + 3, clean_lines.NumLines() - 1)): - trailing_text += clean_lines.elided[offset] - if not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before {') - - # Make sure '} else {' has spaces. - if Search(r'}else', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before else') - - # You shouldn't have a space before a semicolon at the end of the line. - # There's a special case for "for" since the style guide allows space before - # the semicolon there. - if Search(r':\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Semicolon defining empty statement. Use {} instead.') - elif Search(r'^\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Line contains only semicolon. If this should be an empty statement, ' - 'use {} instead.') - elif (Search(r'\s+;\s*$', line) and - not Search(r'\bfor\b', line)): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Extra space before last semicolon. If this should be an empty ' - 'statement, use {} instead.') + line = clean_lines.elided[linenum] + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({>]){', line) + if match: + # Try a bit harder to check for brace initialization. This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... } + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # ternary = expr ? new type{} : nullptr; + # OuterTemplate{}> + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<>]:". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, + len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. + if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error( + filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') def IsDecltype(clean_lines, linenum, column): - """Check if the token ending on (linenum, column) is decltype(). + """Check if the token ending on (linenum, column) is decltype(). Args: clean_lines: A CleansedLines instance containing the file. @@ -3411,16 +3426,16 @@ def IsDecltype(clean_lines, linenum, column): Returns: True if this token is decltype() expression, False otherwise. """ - (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) - if start_col < 0: + (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) + if start_col < 0: + return False + if Search(r'\bdecltype\s*$', text[0:start_col]): + return True return False - if Search(r'\bdecltype\s*$', text[0:start_col]): - return True - return False def IsTemplateParameterList(clean_lines, linenum, column): - """Check if the token ending on (linenum, column) is the end of template<>. + """Check if the token ending on (linenum, column) is the end of template<>. Args: clean_lines: A CleansedLines instance containing the file. @@ -3429,16 +3444,16 @@ def IsTemplateParameterList(clean_lines, linenum, column): Returns: True if this token is end of a template parameter list, False otherwise. """ - (_, startline, startpos) = ReverseCloseExpression( - clean_lines, linenum, column) - if (startpos > -1 and - Search(r'\btemplate\s*$', clean_lines.elided[startline][0:startpos])): - return True - return False + (_, startline, startpos) = ReverseCloseExpression(clean_lines, linenum, + column) + if (startpos > -1 and Search(r'\btemplate\s*$', + clean_lines.elided[startline][0:startpos])): + return True + return False def IsRValueType(typenames, clean_lines, nesting_state, linenum, column): - """Check if the token ending on (linenum, column) is a type. + """Check if the token ending on (linenum, column) is a type. Assumes that text to the right of the column is "&&" or a function name. @@ -3453,196 +3468,198 @@ def IsRValueType(typenames, clean_lines, nesting_state, linenum, column): Returns: True if this token is a type, False if we are not sure. """ - prefix = clean_lines.elided[linenum][0:column] - - # Get one word to the left. If we failed to do so, this is most - # likely not a type, since it's unlikely that the type name and "&&" - # would be split across multiple lines. - match = Match(r'^(.*)(\b\w+|[>*)&])\s*$', prefix) - if not match: - return False + prefix = clean_lines.elided[linenum][0:column] - # Check text following the token. If it's "&&>" or "&&," or "&&...", it's - # most likely a rvalue reference used inside a template. - suffix = clean_lines.elided[linenum][column:] - if Match(r'&&\s*(?:[>,]|\.\.\.)', suffix): - return True + # Get one word to the left. If we failed to do so, this is most + # likely not a type, since it's unlikely that the type name and "&&" + # would be split across multiple lines. + match = Match(r'^(.*)(\b\w+|[>*)&])\s*$', prefix) + if not match: + return False - # Check for known types and end of templates: - # int&& variable - # vector&& variable - # - # Because this function is called recursively, we also need to - # recognize pointer and reference types: - # int* Function() - # int& Function() - if (match.group(2) in typenames or - match.group(2) in ['char', 'char16_t', 'char32_t', 'wchar_t', 'bool', - 'short', 'int', 'long', 'signed', 'unsigned', - 'float', 'double', 'void', 'auto', '>', '*', '&']): - return True + # Check text following the token. If it's "&&>" or "&&," or "&&...", it's + # most likely a rvalue reference used inside a template. + suffix = clean_lines.elided[linenum][column:] + if Match(r'&&\s*(?:[>,]|\.\.\.)', suffix): + return True - # If we see a close parenthesis, look for decltype on the other side. - # decltype would unambiguously identify a type, anything else is - # probably a parenthesized expression and not a type. - if match.group(2) == ')': - return IsDecltype( - clean_lines, linenum, len(match.group(1)) + len(match.group(2)) - 1) - - # Check for casts and cv-qualifiers. - # match.group(1) remainder - # -------------- --------- - # const_cast< type&& - # const type&& - # type const&& - if Search(r'\b(?:const_cast\s*<|static_cast\s*<|dynamic_cast\s*<|' - r'reinterpret_cast\s*<|\w+\s)\s*$', - match.group(1)): - return True + # Check for known types and end of templates: + # int&& variable + # vector&& variable + # + # Because this function is called recursively, we also need to + # recognize pointer and reference types: + # int* Function() + # int& Function() + if (match.group(2) in typenames or match.group(2) in [ + 'char', 'char16_t', 'char32_t', 'wchar_t', 'bool', 'short', 'int', + 'long', 'signed', 'unsigned', 'float', 'double', 'void', 'auto', + '>', '*', '&' + ]): + return True - # Look for a preceding symbol that might help differentiate the context. - # These are the cases that would be ambiguous: - # match.group(1) remainder - # -------------- --------- - # Call ( expression && - # Declaration ( type&& - # sizeof ( type&& - # if ( expression && - # while ( expression && - # for ( type&& - # for( ; expression && - # statement ; type&& - # block { type&& - # constructor { expression && - start = linenum - line = match.group(1) - match_symbol = None - while start >= 0: - # We want to skip over identifiers and commas to get to a symbol. - # Commas are skipped so that we can find the opening parenthesis - # for function parameter lists. - match_symbol = Match(r'^(.*)([^\w\s,])[\w\s,]*$', line) - if match_symbol: - break - start -= 1 - line = clean_lines.elided[start] - - if not match_symbol: - # Probably the first statement in the file is an rvalue reference - return True + # If we see a close parenthesis, look for decltype on the other side. + # decltype would unambiguously identify a type, anything else is + # probably a parenthesized expression and not a type. + if match.group(2) == ')': + return IsDecltype(clean_lines, linenum, + len(match.group(1)) + len(match.group(2)) - 1) + + # Check for casts and cv-qualifiers. + # match.group(1) remainder + # -------------- --------- + # const_cast< type&& + # const type&& + # type const&& + if Search(r'\b(?:const_cast\s*<|static_cast\s*<|dynamic_cast\s*<|' + r'reinterpret_cast\s*<|\w+\s)\s*$', match.group(1)): + return True - if match_symbol.group(2) == '}': - # Found closing brace, probably an indicate of this: - # block{} type&& - return True + # Look for a preceding symbol that might help differentiate the context. + # These are the cases that would be ambiguous: + # match.group(1) remainder + # -------------- --------- + # Call ( expression && + # Declaration ( type&& + # sizeof ( type&& + # if ( expression && + # while ( expression && + # for ( type&& + # for( ; expression && + # statement ; type&& + # block { type&& + # constructor { expression && + start = linenum + line = match.group(1) + match_symbol = None + while start >= 0: + # We want to skip over identifiers and commas to get to a symbol. + # Commas are skipped so that we can find the opening parenthesis + # for function parameter lists. + match_symbol = Match(r'^(.*)([^\w\s,])[\w\s,]*$', line) + if match_symbol: + break + start -= 1 + line = clean_lines.elided[start] - if match_symbol.group(2) == ';': - # Found semicolon, probably one of these: - # for(; expression && - # statement; type&& - - # Look for the previous 'for(' in the previous lines. - before_text = match_symbol.group(1) - for i in xrange(start - 1, max(start - 6, 0), -1): - before_text = clean_lines.elided[i] + before_text - if Search(r'for\s*\([^{};]*$', before_text): - # This is the condition inside a for-loop - return False - - # Did not find a for-init-statement before this semicolon, so this - # is probably a new statement and not a condition. - return True + if not match_symbol: + # Probably the first statement in the file is an rvalue reference + return True - if match_symbol.group(2) == '{': - # Found opening brace, probably one of these: - # block{ type&& = ... ; } - # constructor{ expression && expression } + if match_symbol.group(2) == '}': + # Found closing brace, probably an indicate of this: + # block{} type&& + return True - # Look for a closing brace or a semicolon. If we see a semicolon - # first, this is probably a rvalue reference. - line = clean_lines.elided[start][0:len(match_symbol.group(1)) + 1] - end = start - depth = 1 - while True: - for ch in line: - if ch == ';': - return True - elif ch == '{': - depth += 1 - elif ch == '}': - depth -= 1 - if depth == 0: + if match_symbol.group(2) == ';': + # Found semicolon, probably one of these: + # for(; expression && + # statement; type&& + + # Look for the previous 'for(' in the previous lines. + before_text = match_symbol.group(1) + for i in xrange(start - 1, max(start - 6, 0), -1): + before_text = clean_lines.elided[i] + before_text + if Search(r'for\s*\([^{};]*$', before_text): + # This is the condition inside a for-loop return False - end += 1 - if end >= clean_lines.NumLines(): - break - line = clean_lines.elided[end] - # Incomplete program? - return False - if match_symbol.group(2) == '(': - # Opening parenthesis. Need to check what's to the left of the - # parenthesis. Look back one extra line for additional context. - before_text = match_symbol.group(1) - if linenum > 1: - before_text = clean_lines.elided[linenum - 1] + before_text - before_text = match_symbol.group(1) - - # Patterns that are likely to be types: - # [](type&& - # for (type&& - # sizeof(type&& - # operator=(type&& - # - if Search(r'(?:\]|\bfor|\bsizeof|\boperator\s*\S+\s*)\s*$', before_text): - return True - - # Patterns that are likely to be expressions: - # if (expression && - # while (expression && - # : initializer(expression && - # , initializer(expression && - # ( FunctionCall(expression && - # + FunctionCall(expression && - # + (expression && - # - # The last '+' represents operators such as '+' and '-'. - if Search(r'(?:\bif|\bwhile|[-+=%^(]*>)?\s*$', - match_symbol.group(1)) - if match_func: - # Check for constructors, which don't have return types. - if Search(r'\b(?:explicit|inline)$', match_func.group(1)): - return True - implicit_constructor = Match(r'\s*(\w+)\((?:const\s+)?(\w+)', prefix) - if (implicit_constructor and - implicit_constructor.group(1) == implicit_constructor.group(2)): + # Did not find a for-init-statement before this semicolon, so this + # is probably a new statement and not a condition. return True - return IsRValueType(typenames, clean_lines, nesting_state, linenum, - len(match_func.group(1))) - # Nothing before the function name. If this is inside a block scope, - # this is probably a function call. - return not (nesting_state.previous_stack_top and - nesting_state.previous_stack_top.IsBlockInfo()) + if match_symbol.group(2) == '{': + # Found opening brace, probably one of these: + # block{ type&& = ... ; } + # constructor{ expression && expression } + + # Look for a closing brace or a semicolon. If we see a semicolon + # first, this is probably a rvalue reference. + line = clean_lines.elided[start][0:len(match_symbol.group(1)) + 1] + end = start + depth = 1 + while True: + for ch in line: + if ch == ';': + return True + elif ch == '{': + depth += 1 + elif ch == '}': + depth -= 1 + if depth == 0: + return False + end += 1 + if end >= clean_lines.NumLines(): + break + line = clean_lines.elided[end] + # Incomplete program? + return False - if match_symbol.group(2) == '>': - # Possibly a closing bracket, check that what's on the other side - # looks like the start of a template. - return IsTemplateParameterList( - clean_lines, start, len(match_symbol.group(1))) + if match_symbol.group(2) == '(': + # Opening parenthesis. Need to check what's to the left of the + # parenthesis. Look back one extra line for additional context. + before_text = match_symbol.group(1) + if linenum > 1: + before_text = clean_lines.elided[linenum - 1] + before_text + before_text = match_symbol.group(1) + + # Patterns that are likely to be types: + # [](type&& + # for (type&& + # sizeof(type&& + # operator=(type&& + # + if Search(r'(?:\]|\bfor|\bsizeof|\boperator\s*\S+\s*)\s*$', + before_text): + return True + + # Patterns that are likely to be expressions: + # if (expression && + # while (expression && + # : initializer(expression && + # , initializer(expression && + # ( FunctionCall(expression && + # + FunctionCall(expression && + # + (expression && + # + # The last '+' represents operators such as '+' and '-'. + if Search(r'(?:\bif|\bwhile|[-+=%^(]*>)?\s*$', + match_symbol.group(1)) + if match_func: + # Check for constructors, which don't have return types. + if Search(r'\b(?:explicit|inline)$', match_func.group(1)): + return True + implicit_constructor = Match(r'\s*(\w+)\((?:const\s+)?(\w+)', + prefix) + if (implicit_constructor and implicit_constructor.group(1) == + implicit_constructor.group(2)): + return True + return IsRValueType(typenames, clean_lines, nesting_state, linenum, + len(match_func.group(1))) + + # Nothing before the function name. If this is inside a block scope, + # this is probably a function call. + return not (nesting_state.previous_stack_top and + nesting_state.previous_stack_top.IsBlockInfo()) + + if match_symbol.group(2) == '>': + # Possibly a closing bracket, check that what's on the other side + # looks like the start of a template. + return IsTemplateParameterList(clean_lines, start, + len(match_symbol.group(1))) + + # Some other symbol, usually something like "a=b&&c". This is most + # likely not a type. + return False def IsDeletedOrDefault(clean_lines, linenum): - """Check if current constructor or operator is deleted or default. + """Check if current constructor or operator is deleted or default. Args: clean_lines: A CleansedLines instance containing the file. @@ -3650,18 +3667,18 @@ def IsDeletedOrDefault(clean_lines, linenum): Returns: True if this is a deleted or default constructor. """ - open_paren = clean_lines.elided[linenum].find('(') - if open_paren < 0: - return False - (close_line, _, close_paren) = CloseExpression( - clean_lines, linenum, open_paren) - if close_paren < 0: - return False - return Match(r'\s*=\s*(?:delete|default)\b', close_line[close_paren:]) + open_paren = clean_lines.elided[linenum].find('(') + if open_paren < 0: + return False + (close_line, _, close_paren) = CloseExpression(clean_lines, linenum, + open_paren) + if close_paren < 0: + return False + return Match(r'\s*=\s*(?:delete|default)\b', close_line[close_paren:]) def IsRValueAllowed(clean_lines, linenum, typenames): - """Check if RValue reference is allowed on a particular line. + """Check if RValue reference is allowed on a particular line. Args: clean_lines: A CleansedLines instance containing the file. @@ -3670,56 +3687,57 @@ def IsRValueAllowed(clean_lines, linenum, typenames): Returns: True if line is within the region where RValue references are allowed. """ - # Allow region marked by PUSH/POP macros - for i in xrange(linenum, 0, -1): - line = clean_lines.elided[i] - if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): - if not line.endswith('PUSH'): - return False - for j in xrange(linenum, clean_lines.NumLines(), 1): - line = clean_lines.elided[j] + # Allow region marked by PUSH/POP macros + for i in xrange(linenum, 0, -1): + line = clean_lines.elided[i] if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): - return line.endswith('POP') - - # Allow operator= - line = clean_lines.elided[linenum] - if Search(r'\boperator\s*=\s*\(', line): - return IsDeletedOrDefault(clean_lines, linenum) - - # Allow constructors - match = Match(r'\s*(?:[\w<>]+::)*([\w<>]+)\s*::\s*([\w<>]+)\s*\(', line) - if match and match.group(1) == match.group(2): - return IsDeletedOrDefault(clean_lines, linenum) - if Search(r'\b(?:explicit|inline)\s+[\w<>]+\s*\(', line): - return IsDeletedOrDefault(clean_lines, linenum) - - if Match(r'\s*[\w<>]+\s*\(', line): - previous_line = 'ReturnType' - if linenum > 0: - previous_line = clean_lines.elided[linenum - 1] - if Match(r'^\s*$', previous_line) or Search(r'[{}:;]\s*$', previous_line): - return IsDeletedOrDefault(clean_lines, linenum) + if not line.endswith('PUSH'): + return False + for j in xrange(linenum, clean_lines.NumLines(), 1): + line = clean_lines.elided[j] + if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): + return line.endswith('POP') - # Reject types not mentioned in template-argument-list - while line: - match = Match(r'^.*?(\w+)\s*&&(.*)$', line) - if not match: - break - if match.group(1) not in typenames: - return False - line = match.group(2) + # Allow operator= + line = clean_lines.elided[linenum] + if Search(r'\boperator\s*=\s*\(', line): + return IsDeletedOrDefault(clean_lines, linenum) + + # Allow constructors + match = Match(r'\s*(?:[\w<>]+::)*([\w<>]+)\s*::\s*([\w<>]+)\s*\(', line) + if match and match.group(1) == match.group(2): + return IsDeletedOrDefault(clean_lines, linenum) + if Search(r'\b(?:explicit|inline)\s+[\w<>]+\s*\(', line): + return IsDeletedOrDefault(clean_lines, linenum) + + if Match(r'\s*[\w<>]+\s*\(', line): + previous_line = 'ReturnType' + if linenum > 0: + previous_line = clean_lines.elided[linenum - 1] + if Match(r'^\s*$', previous_line) or Search(r'[{}:;]\s*$', + previous_line): + return IsDeletedOrDefault(clean_lines, linenum) + + # Reject types not mentioned in template-argument-list + while line: + match = Match(r'^.*?(\w+)\s*&&(.*)$', line) + if not match: + break + if match.group(1) not in typenames: + return False + line = match.group(2) - # All RValue types that were in template-argument-list should have - # been removed by now. Those were allowed, assuming that they will - # be forwarded. - # - # If there are no remaining RValue types left (i.e. types that were - # not found in template-argument-list), flag those as not allowed. - return line.find('&&') < 0 + # All RValue types that were in template-argument-list should have + # been removed by now. Those were allowed, assuming that they will + # be forwarded. + # + # If there are no remaining RValue types left (i.e. types that were + # not found in template-argument-list), flag those as not allowed. + return line.find('&&') < 0 def GetTemplateArgs(clean_lines, linenum): - """Find list of template arguments associated with this function declaration. + """Find list of template arguments associated with this function declaration. Args: clean_lines: A CleansedLines instance containing the file. @@ -3729,61 +3747,63 @@ def GetTemplateArgs(clean_lines, linenum): Set of type names, or empty set if this does not appear to have any template parameters. """ - # Find start of function - func_line = linenum - while func_line > 0: - line = clean_lines.elided[func_line] - if Match(r'^\s*$', line): - return set() - if line.find('(') >= 0: - break - func_line -= 1 - if func_line == 0: - return set() - - # Collapse template-argument-list into a single string - argument_list = '' - match = Match(r'^(\s*template\s*)<', clean_lines.elided[func_line]) - if match: - # template-argument-list on the same line as function name - start_col = len(match.group(1)) - _, end_line, end_col = CloseExpression(clean_lines, func_line, start_col) - if end_col > -1 and end_line == func_line: - start_col += 1 # Skip the opening bracket - argument_list = clean_lines.elided[func_line][start_col:end_col] - - elif func_line > 1: - # template-argument-list one line before function name - match = Match(r'^(.*)>\s*$', clean_lines.elided[func_line - 1]) + # Find start of function + func_line = linenum + while func_line > 0: + line = clean_lines.elided[func_line] + if Match(r'^\s*$', line): + return set() + if line.find('(') >= 0: + break + func_line -= 1 + if func_line == 0: + return set() + + # Collapse template-argument-list into a single string + argument_list = '' + match = Match(r'^(\s*template\s*)<', clean_lines.elided[func_line]) if match: - end_col = len(match.group(1)) - _, start_line, start_col = ReverseCloseExpression( - clean_lines, func_line - 1, end_col) - if start_col > -1: - start_col += 1 # Skip the opening bracket - while start_line < func_line - 1: - argument_list += clean_lines.elided[start_line][start_col:] - start_col = 0 - start_line += 1 - argument_list += clean_lines.elided[func_line - 1][start_col:end_col] - - if not argument_list: - return set() - - # Extract type names - typenames = set() - while True: - match = Match(r'^[,\s]*(?:typename|class)(?:\.\.\.)?\s+(\w+)(.*)$', - argument_list) - if not match: - break - typenames.add(match.group(1)) - argument_list = match.group(2) - return typenames + # template-argument-list on the same line as function name + start_col = len(match.group(1)) + _, end_line, end_col = CloseExpression(clean_lines, func_line, + start_col) + if end_col > -1 and end_line == func_line: + start_col += 1 # Skip the opening bracket + argument_list = clean_lines.elided[func_line][start_col:end_col] + + elif func_line > 1: + # template-argument-list one line before function name + match = Match(r'^(.*)>\s*$', clean_lines.elided[func_line - 1]) + if match: + end_col = len(match.group(1)) + _, start_line, start_col = ReverseCloseExpression( + clean_lines, func_line - 1, end_col) + if start_col > -1: + start_col += 1 # Skip the opening bracket + while start_line < func_line - 1: + argument_list += clean_lines.elided[start_line][start_col:] + start_col = 0 + start_line += 1 + argument_list += clean_lines.elided[func_line - 1][start_col: + end_col] + + if not argument_list: + return set() + + # Extract type names + typenames = set() + while True: + match = Match(r'^[,\s]*(?:typename|class)(?:\.\.\.)?\s+(\w+)(.*)$', + argument_list) + if not match: + break + typenames.add(match.group(1)) + argument_list = match.group(2) + return typenames def CheckRValueReference(filename, clean_lines, linenum, nesting_state, error): - """Check for rvalue references. + """Check for rvalue references. Args: filename: The name of the current file. @@ -3793,33 +3813,34 @@ def CheckRValueReference(filename, clean_lines, linenum, nesting_state, error): the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - # Find lines missing spaces around &&. - # TODO(unknown): currently we don't check for rvalue references - # with spaces surrounding the && to avoid false positives with - # boolean expressions. - line = clean_lines.elided[linenum] - match = Match(r'^(.*\S)&&', line) - if not match: - match = Match(r'(.*)&&\S', line) - if (not match) or '(&&)' in line or Search(r'\boperator\s*$', match.group(1)): - return - - # Either poorly formed && or an rvalue reference, check the context - # to get a more accurate error message. Mostly we want to determine - # if what's to the left of "&&" is a type or not. - typenames = GetTemplateArgs(clean_lines, linenum) - and_pos = len(match.group(1)) - if IsRValueType(typenames, clean_lines, nesting_state, linenum, and_pos): - if not IsRValueAllowed(clean_lines, linenum, typenames): - error(filename, linenum, 'build/c++11', 3, - 'RValue references are an unapproved C++ feature.') - else: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around &&') + # Find lines missing spaces around &&. + # TODO(unknown): currently we don't check for rvalue references + # with spaces surrounding the && to avoid false positives with + # boolean expressions. + line = clean_lines.elided[linenum] + match = Match(r'^(.*\S)&&', line) + if not match: + match = Match(r'(.*)&&\S', line) + if (not match) or '(&&)' in line or Search(r'\boperator\s*$', + match.group(1)): + return + + # Either poorly formed && or an rvalue reference, check the context + # to get a more accurate error message. Mostly we want to determine + # if what's to the left of "&&" is a type or not. + typenames = GetTemplateArgs(clean_lines, linenum) + and_pos = len(match.group(1)) + if IsRValueType(typenames, clean_lines, nesting_state, linenum, and_pos): + if not IsRValueAllowed(clean_lines, linenum, typenames): + error(filename, linenum, 'build/c++11', 3, + 'RValue references are an unapproved C++ feature.') + else: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around &&') def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): - """Checks for additional blank line issues related to sections. + """Checks for additional blank line issues related to sections. Currently the only thing checked here is blank line before protected/private. @@ -3830,51 +3851,53 @@ def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - # Skip checks if the class is small, where small means 25 lines or less. - # 25 lines seems like a good cutoff since that's the usual height of - # terminals, and any class that can't fit in one screen can't really - # be considered "small". - # - # Also skip checks if we are on the first line. This accounts for - # classes that look like - # class Foo { public: ... }; - # - # If we didn't find the end of the class, last_line would be zero, - # and the check will be skipped by the first condition. - if (class_info.last_line - class_info.starting_linenum <= 24 or - linenum <= class_info.starting_linenum): - return - - matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) - if matched: - # Issue warning if the line before public/protected/private was - # not a blank line, but don't do this if the previous line contains - # "class" or "struct". This can happen two ways: - # - We are at the beginning of the class. - # - We are forward-declaring an inner class that is semantically - # private, but needed to be public for implementation reasons. - # Also ignores cases where the previous line ends with a backslash as can be - # common when defining classes in C macros. - prev_line = clean_lines.lines[linenum - 1] - if (not IsBlankLine(prev_line) and - not Search(r'\b(class|struct)\b', prev_line) and - not Search(r'\\$', prev_line)): - # Try a bit harder to find the beginning of the class. This is to - # account for multi-line base-specifier lists, e.g.: - # class Derived - # : public Base { - end_class_head = class_info.starting_linenum - for i in range(class_info.starting_linenum, linenum): - if Search(r'\{\s*$', clean_lines.lines[i]): - end_class_head = i - break - if end_class_head < linenum - 1: - error(filename, linenum, 'whitespace/blank_line', 3, - '"%s:" should be preceded by a blank line' % matched.group(1)) + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. + if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', + clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % + matched.group(1)) def GetPreviousNonBlankLine(clean_lines, linenum): - """Return the most recent non-blank line and its line number. + """Return the most recent non-blank line and its line number. Args: clean_lines: A CleansedLines instance containing the file contents. @@ -3887,17 +3910,17 @@ def GetPreviousNonBlankLine(clean_lines, linenum): if this is the first non-blank line. """ - prevlinenum = linenum - 1 - while prevlinenum >= 0: - prevline = clean_lines.elided[prevlinenum] - if not IsBlankLine(prevline): # if not a blank line... - return (prevline, prevlinenum) - prevlinenum -= 1 - return ('', -1) + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) def CheckBraces(filename, clean_lines, linenum, error): - """Looks for misplaced braces (e.g. at the end of line). + """Looks for misplaced braces (e.g. at the end of line). Args: filename: The name of the current file. @@ -3906,114 +3929,123 @@ def CheckBraces(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - if Match(r'\s*{\s*$', line): - # We allow an open brace to start a line in the case where someone is using - # braces in a block to explicitly create a new scope, which is commonly used - # to control the lifetime of stack-allocated variables. Braces are also - # used for brace initializers inside function calls. We don't detect this - # perfectly: we just don't complain if the last non-whitespace character on - # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline)): - error(filename, linenum, 'whitespace/braces', 4, - '{ should almost always be at the end of the previous line') - - # An else clause should be on the same line as the preceding closing brace. - if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if Match(r'\s*}\s*$', prevline): - error(filename, linenum, 'whitespace/newline', 4, - 'An else should appear on the same line as the preceding }') - - # If braces come on one side of an else, they should be on both. - # However, we have to worry about "else if" that spans multiple lines! - if Search(r'else if\s*\(', line): # could be multi-line if - brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - brace_on_right = endline[endpos:].find('{') != -1 - if brace_on_left != brace_on_right: # must be brace after if + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! + if Search(r'else if\s*\(', line): # could be multi-line if + brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + brace_on_right = endline[endpos:].find('{') != -1 + if brace_on_left != brace_on_right: # must be brace after if + error( + filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both' + ) + elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): error(filename, linenum, 'readability/braces', 5, 'If an else has a brace on one side, it should have it on both') - elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - - # Likewise, an else should never have the else clause on the same line - if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): - error(filename, linenum, 'whitespace/newline', 4, - 'Else clause should never be on same line as else (use 2 lines)') - - # In the same way, a do/while should never be on one line - if Match(r'\s*do [^\s{]', line): - error(filename, linenum, 'whitespace/newline', 4, - 'do/while clauses should not be on a single line') - - # Check single-line if/else bodies. The style guide says 'curly braces are not - # required for single-line statements'. We additionally allow multi-line, - # single statements, but we reject anything with more than one semicolon in - # it. This means that the first semicolon after the if should be at the end of - # its line, and the line after that should have an indent level equal to or - # lower than the if. We also check for ambiguous if/else nesting without - # braces. - if_else_match = Search(r'\b(if\s*\(|else\b)', line) - if if_else_match and not Match(r'\s*#', line): - if_indent = GetIndentLevel(line) - endline, endlinenum, endpos = line, linenum, if_else_match.end() - if_match = Search(r'\bif\s*\(', line) - if if_match: - # This could be a multiline if condition, so find the end first. - pos = if_match.end() - 1 - (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) - # Check for an opening brace, either directly after the if or on the next - # line. If found, this isn't a single-statement conditional. - if (not Match(r'\s*{', endline[endpos:]) - and not (Match(r'\s*$', endline[endpos:]) - and endlinenum < (len(clean_lines.elided) - 1) - and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): - while (endlinenum < len(clean_lines.elided) - and ';' not in clean_lines.elided[endlinenum][endpos:]): - endlinenum += 1 - endpos = 0 - if endlinenum < len(clean_lines.elided): - endline = clean_lines.elided[endlinenum] - # We allow a mix of whitespace and closing braces (e.g. for one-liner - # methods) and a single \ after the semicolon (for macros) - endpos = endline.find(';') - if not Match(r';[\s}]*(\\?)$', endline[endpos:]): - # Semicolon isn't the last character, there's something trailing. - # Output a warning if the semicolon is not contained inside - # a lambda expression. - if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', - endline): - error(filename, linenum, 'readability/braces', 4, - 'If/else bodies with multiple statements require braces') - elif endlinenum < len(clean_lines.elided) - 1: - # Make sure the next line is dedented - next_line = clean_lines.elided[endlinenum + 1] - next_indent = GetIndentLevel(next_line) - # With ambiguous nested if statements, this will error out on the - # if that *doesn't* match the else, regardless of whether it's the - # inner one or outer one. - if (if_match and Match(r'\s*else\b', next_line) - and next_indent != if_indent): - error(filename, linenum, 'readability/braces', 4, - 'Else clause should be indented at the same level as if. ' - 'Ambiguous nested if/else chains require braces.') - elif next_indent > if_indent: - error(filename, linenum, 'readability/braces', 4, - 'If/else bodies with multiple statements require braces') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Check single-line if/else bodies. The style guide says 'curly braces are not + # required for single-line statements'. We additionally allow multi-line, + # single statements, but we reject anything with more than one semicolon in + # it. This means that the first semicolon after the if should be at the end of + # its line, and the line after that should have an indent level equal to or + # lower than the if. We also check for ambiguous if/else nesting without + # braces. + if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if if_else_match and not Match(r'\s*#', line): + if_indent = GetIndentLevel(line) + endline, endlinenum, endpos = line, linenum, if_else_match.end() + if_match = Search(r'\bif\s*\(', line) + if if_match: + # This could be a multiline if condition, so find the end first. + pos = if_match.end() - 1 + (endline, endlinenum, endpos) = CloseExpression(clean_lines, + linenum, pos) + # Check for an opening brace, either directly after the if or on the next + # line. If found, this isn't a single-statement conditional. + if (not Match(r'\s*{', endline[endpos:]) and + not (Match(r'\s*$', endline[endpos:]) and endlinenum < + (len(clean_lines.elided) - 1) and + Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): + while (endlinenum < len(clean_lines.elided) and + ';' not in clean_lines.elided[endlinenum][endpos:]): + endlinenum += 1 + endpos = 0 + if endlinenum < len(clean_lines.elided): + endline = clean_lines.elided[endlinenum] + # We allow a mix of whitespace and closing braces (e.g. for one-liner + # methods) and a single \ after the semicolon (for macros) + endpos = endline.find(';') + if not Match(r';[\s}]*(\\?)$', endline[endpos:]): + # Semicolon isn't the last character, there's something trailing. + # Output a warning if the semicolon is not contained inside + # a lambda expression. + if not Match( + r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', + endline): + error( + filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces' + ) + elif endlinenum < len(clean_lines.elided) - 1: + # Make sure the next line is dedented + next_line = clean_lines.elided[endlinenum + 1] + next_indent = GetIndentLevel(next_line) + # With ambiguous nested if statements, this will error out on the + # if that *doesn't* match the else, regardless of whether it's the + # inner one or outer one. + if (if_match and Match(r'\s*else\b', next_line) and + next_indent != if_indent): + error( + filename, linenum, 'readability/braces', 4, + 'Else clause should be indented at the same level as if. ' + 'Ambiguous nested if/else chains require braces.') + elif next_indent > if_indent: + error( + filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces' + ) def CheckTrailingSemicolon(filename, clean_lines, linenum, error): - """Looks for redundant trailing semicolon. + """Looks for redundant trailing semicolon. Args: filename: The name of the current file. @@ -4022,135 +4054,133 @@ def CheckTrailingSemicolon(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Block bodies should not be followed by a semicolon. Due to C++11 - # brace initialization, there are more places where semicolons are - # required than not, so we use a whitelist approach to check these - # rather than a blacklist. These are the places where "};" should - # be replaced by just "}": - # 1. Some flavor of block following closing parenthesis: - # for (;;) {}; - # while (...) {}; - # switch (...) {}; - # Function(...) {}; - # if (...) {}; - # if (...) else if (...) {}; - # - # 2. else block: - # if (...) else {}; - # - # 3. const member function: - # Function(...) const {}; - # - # 4. Block following some statement: - # x = 42; - # {}; - # - # 5. Block at the beginning of a function: - # Function(...) { - # {}; - # } - # - # Note that naively checking for the preceding "{" will also match - # braces inside multi-dimensional arrays, but this is fine since - # that expression will not contain semicolons. - # - # 6. Block following another block: - # while (true) {} - # {}; - # - # 7. End of namespaces: - # namespace {}; - # - # These semicolons seems far more common than other kinds of - # redundant semicolons, possibly due to people converting classes - # to namespaces. For now we do not warn for this case. - # - # Try matching case 1 first. - match = Match(r'^(.*\)\s*)\{', line) - if match: - # Matched closing parenthesis (case 1). Check the token before the - # matching opening parenthesis, and don't warn if it looks like a - # macro. This avoids these false positives: - # - macro that defines a base class - # - multi-line macro that defines a base class - # - macro that defines the whole class-head + line = clean_lines.elided[linenum] + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; # - # But we still issue warnings for macros that we know are safe to - # warn, specifically: - # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P - # - TYPED_TEST - # - INTERFACE_DEF - # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # 2. else block: + # if (...) else {}; # - # We implement a whitelist of safe macros instead of a blacklist of - # unsafe macros, even though the latter appears less frequently in - # google code and would have been easier to implement. This is because - # the downside for getting the whitelist wrong means some extra - # semicolons, while the downside for getting the blacklist wrong - # would result in compile errors. + # 3. const member function: + # Function(...) const {}; # - # In addition to macros, we also don't want to warn on - # - Compound literals - # - Lambdas - # - alignas specifier with anonymous structs: - closing_brace_pos = match.group(1).rfind(')') - opening_parenthesis = ReverseCloseExpression( - clean_lines, linenum, closing_brace_pos) - if opening_parenthesis[2] > -1: - line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) - func = Match(r'^(.*\])\s*$', line_prefix) - if ((macro and - macro.group(1) not in ( - 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', - 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', - 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or - (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or - Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or - Search(r'\s+=\s*$', line_prefix)): - match = None - if (match and - opening_parenthesis[1] > 1 and - Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): - # Multi-line lambda-expression - match = None - - else: - # Try matching cases 2-3. - match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) - if not match: - # Try matching cases 4-6. These are always matched on separate lines. - # - # Note that we can't simply concatenate the previous line to the - # current line and do a single match, otherwise we may output - # duplicate warnings for the blank line case: - # if (cond) { - # // blank line - # } - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if prevline and Search(r'[;{}]\s*$', prevline): - match = Match(r'^(\s*)\{', line) - - # Check matching closing brace - if match: - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if endpos > -1 and Match(r'^\s*;', endline[endpos:]): - # Current {} pair is eligible for semicolon check, and we have found - # the redundant semicolon, output warning here. - # - # Note: because we are scanning forward for opening braces, and - # outputting warnings for the matching closing brace, if there are - # nested blocks with trailing semicolons, we will get the error - # messages in reversed order. - error(filename, endlinenum, 'readability/braces', 4, - "You don't need a ; after a }") + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) { + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. + # + # In addition to macros, we also don't want to warn on + # - Compound literals + # - Lambdas + # - alignas specifier with anonymous structs: + closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression(clean_lines, linenum, + closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + func = Match(r'^(.*\])\s*$', line_prefix) + if ((macro and macro.group(1) not in + ('TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or + Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or + Search(r'\s+=\s*$', line_prefix)): + match = None + if (match and opening_parenthesis[1] > 1 and Search( + r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): + # Multi-line lambda-expression + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. + error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") def CheckEmptyBlockBody(filename, clean_lines, linenum, error): - """Look for empty loop/conditional body with only a single semicolon. + """Look for empty loop/conditional body with only a single semicolon. Args: filename: The name of the current file. @@ -4159,33 +4189,34 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - # Search for loop keywords at the beginning of the line. Because only - # whitespaces are allowed before the keywords, this will also ignore most - # do-while-loops, since those lines should start with closing brace. - # - # We also check "if" blocks here, since an empty conditional block - # is likely an error. - line = clean_lines.elided[linenum] - matched = Match(r'\s*(for|while|if)\s*\(', line) - if matched: - # Find the end of the conditional expression - (end_line, end_linenum, end_pos) = CloseExpression( - clean_lines, linenum, line.find('(')) - - # Output warning if what follows the condition expression is a semicolon. - # No warning for all other cases, including whitespace or newline, since we - # have a separate check for semicolons preceded by whitespace. - if end_pos >= 0 and Match(r';', end_line[end_pos:]): - if matched.group(1) == 'if': - error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, - 'Empty conditional bodies should use {}') - else: - error(filename, end_linenum, 'whitespace/empty_loop_body', 5, - 'Empty loop bodies should use {} or continue') + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. + line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression(clean_lines, linenum, + line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, + 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') def FindCheckMacro(line): - """Find a replaceable CHECK-like macro. + """Find a replaceable CHECK-like macro. Args: line: line to search on. @@ -4193,22 +4224,22 @@ def FindCheckMacro(line): (macro name, start position), or (None, -1) if no replaceable macro is found. """ - for macro in _CHECK_MACROS: - i = line.find(macro) - if i >= 0: - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. - matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) - if not matched: - continue - return (macro, len(matched.group(1))) - return (None, -1) + for macro in _CHECK_MACROS: + i = line.find(macro) + if i >= 0: + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) + if not matched: + continue + return (macro, len(matched.group(1))) + return (None, -1) def CheckCheck(filename, clean_lines, linenum, error): - """Checks the use of CHECK and EXPECT macros. + """Checks the use of CHECK and EXPECT macros. Args: filename: The name of the current file. @@ -4217,116 +4248,116 @@ def CheckCheck(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - # Decide the set of replacement macros that should be suggested - lines = clean_lines.elided - (check_macro, start_pos) = FindCheckMacro(lines[linenum]) - if not check_macro: - return - - # Find end of the boolean expression by matching parentheses - (last_line, end_line, end_pos) = CloseExpression( - clean_lines, linenum, start_pos) - if end_pos < 0: - return - - # If the check macro is followed by something other than a - # semicolon, assume users will log their own custom error messages - # and don't suggest any replacements. - if not Match(r'\s*;', last_line[end_pos:]): - return - - if linenum == end_line: - expression = lines[linenum][start_pos + 1:end_pos - 1] - else: - expression = lines[linenum][start_pos + 1:] - for i in xrange(linenum + 1, end_line): - expression += lines[i] - expression += last_line[0:end_pos - 1] - - # Parse expression so that we can take parentheses into account. - # This avoids false positives for inputs like "CHECK((a < 4) == b)", - # which is not replaceable by CHECK_LE. - lhs = '' - rhs = '' - operator = None - while expression: - matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' - r'==|!=|>=|>|<=|<|\()(.*)$', expression) - if matched: - token = matched.group(1) - if token == '(': - # Parenthesized operand - expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) - if end < 0: - return # Unmatched parenthesis - lhs += '(' + expression[0:end] - expression = expression[end:] - elif token in ('&&', '||'): - # Logical and/or operators. This means the expression - # contains more than one term, for example: - # CHECK(42 < a && a < b); - # - # These are not replaceable with CHECK_LE, so bail out early. + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + (check_macro, start_pos) = FindCheckMacro(lines[linenum]) + if not check_macro: return - elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): - # Non-relational operator - lhs += token - expression = matched.group(2) - else: - # Relational operator - operator = token - rhs = matched.group(2) - break + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression(clean_lines, linenum, + start_pos) + if end_pos < 0: + return + + # If the check macro is followed by something other than a + # semicolon, assume users will log their own custom error messages + # and don't suggest any replacements. + if not Match(r'\s*;', last_line[end_pos:]): + return + + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] else: - # Unparenthesized operand. Instead of appending to lhs one character - # at a time, we do another regular expression match to consume several - # characters at once if possible. Trivial benchmark shows that this - # is more efficient when the operands are longer than a single - # character, which is generally the case. - matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) - if not matched: - matched = Match(r'^(\s*\S)(.*)$', expression) - if not matched: - break - lhs += matched.group(1) - expression = matched.group(2) - - # Only apply checks if we got all parts of the boolean expression - if not (lhs and operator and rhs): - return - - # Check that rhs do not contain logical operators. We already know - # that lhs is fine since the loop above parses out && and ||. - if rhs.find('&&') > -1 or rhs.find('||') > -1: - return - - # At least one of the operands must be a constant literal. This is - # to avoid suggesting replacements for unprintable things like - # CHECK(variable != iterator) - # - # The following pattern matches decimal, hex integers, strings, and - # characters (in that order). - lhs = lhs.strip() - rhs = rhs.strip() - match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' - if Match(match_constant, lhs) or Match(match_constant, rhs): - # Note: since we know both lhs and rhs, we can provide a more - # descriptive error message like: - # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) - # Instead of: - # Consider using CHECK_EQ instead of CHECK(a == b) + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) # - # We are still keeping the less descriptive message because if lhs - # or rhs gets long, the error message might become unreadable. - error(filename, linenum, 'readability/check', 2, - 'Consider using %s instead of %s(a %s b)' % ( - _CHECK_REPLACEMENT[check_macro][operator], - check_macro, operator)) + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). + lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % + (_CHECK_REPLACEMENT[check_macro][operator], check_macro, + operator)) def CheckAltTokens(filename, clean_lines, linenum, error): - """Check alternative keywords being used in boolean expressions. + """Check alternative keywords being used in boolean expressions. Args: filename: The name of the current file. @@ -4334,31 +4365,31 @@ def CheckAltTokens(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] + line = clean_lines.elided[linenum] - # Avoid preprocessor lines - if Match(r'^\s*#', line): - return + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return - # Last ditch effort to avoid multi-line comments. This will not help - # if the comment started before the current line or ended after the - # current line, but it catches most of the false positives. At least, - # it provides a way to workaround this warning for people who use - # multi-line comments in preprocessor macros. - # - # TODO(unknown): remove this once cpplint has better support for - # multi-line comments. - if line.find('/*') >= 0 or line.find('*/') >= 0: - return + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return - for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): - error(filename, linenum, 'readability/alt_tokens', 2, - 'Use operator %s instead of %s' % ( - _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) def GetLineWidth(line): - """Determines the width of the line in column positions. + """Determines the width of the line in column positions. Args: line: A string, which may be a Unicode string. @@ -4367,21 +4398,21 @@ def GetLineWidth(line): The width of the line in column positions, accounting for Unicode combining characters and wide characters. """ - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - width += 1 - return width - else: - return len(line) + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, error): - """Checks rules from the 'C++ style rules' section of cppguide.html. + """Checks rules from the 'C++ style rules' section of cppguide.html. Most of these rules are hard to test (naming, comment style), but we do what we can. In particular we check for 2-space indents, line lengths, @@ -4397,105 +4428,105 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, error: The function to call with any errors found. """ - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw_lines = clean_lines.lines_without_raw_strings - line = raw_lines[linenum] - - if line.find('\t') != -1: - error(filename, linenum, 'whitespace/tab', 1, - 'Tab found; better to use spaces') - - # One or three blank spaces at the beginning of the line is weird; it's - # hard to reconcile that with 2-space indents. - # NOTE: here are the conditions rob pike used for his tests. Mine aren't - # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces - # if(RLENGTH > 20) complain = 0; - # if(match($0, " +(error|private|public|protected):")) complain = 0; - # if(match(prev, "&& *$")) complain = 0; - # if(match(prev, "\\|\\| *$")) complain = 0; - # if(match(prev, "[\",=><] *$")) complain = 0; - # if(match($0, " <<")) complain = 0; - # if(match(prev, " +for \\(")) complain = 0; - # if(prevodd && match(prevprev, " +for \\(")) complain = 0; - scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' - classinfo = nesting_state.InnermostClass() - initial_spaces = 0 - cleansed_line = clean_lines.elided[linenum] - while initial_spaces < len(line) and line[initial_spaces] == ' ': - initial_spaces += 1 - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. Consider deleting these extra spaces.') - # There are certain situations we allow one space, notably for - # section labels, and also lines containing multi-line raw strings. - elif ((initial_spaces == 1 or initial_spaces == 3) and - not Match(scope_or_label_pattern, cleansed_line) and - not (clean_lines.raw_lines[linenum] != line and - Match(r'^\s*""', line))): - error(filename, linenum, 'whitespace/indent', 3, - 'Weird number of spaces at line-start. ' - 'Are you using a 2-space indent?') - - # Check if the line is a header guard. - is_header_guard = False - if file_extension == 'h': - cppvar = GetHeaderGuardCPPVariable(filename) - if (line.startswith('#ifndef %s' % cppvar) or - line.startswith('#define %s' % cppvar) or - line.startswith('#endif // %s' % cppvar)): - is_header_guard = True - # #include lines and header guards can be long, since there's no clean way to - # split them. - # - # URLs can be long too. It's possible to split these, but it makes them - # harder to cut&paste. - # - # The "$Id:...$" comment may also get very long without it being the - # developers fault. - if (not line.startswith('#include') and not is_header_guard and - not Match(r'^\s*//.*http(s?)://\S*$', line) and - not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): - line_width = GetLineWidth(line) - extended_length = int((_line_length * 1.25)) - if line_width > extended_length: - error(filename, linenum, 'whitespace/line_length', 4, - 'Lines should very rarely be longer than %i characters' % - extended_length) - elif line_width > _line_length: - error(filename, linenum, 'whitespace/line_length', 2, - 'Lines should be <= %i characters long' % _line_length) - - if (cleansed_line.count(';') > 1 and - # for loops are allowed two ;'s (and may run over two lines). - cleansed_line.find('for') == -1 and - (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or - GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and - # It's ok to have many commands in a switch case that fits in 1 line - not ((cleansed_line.find('case ') != -1 or - cleansed_line.find('default:') != -1) and - cleansed_line.find('break;') != -1)): - error(filename, linenum, 'whitespace/newline', 0, - 'More than one command on the same line') - - # Some more style checks - CheckBraces(filename, clean_lines, linenum, error) - CheckTrailingSemicolon(filename, clean_lines, linenum, error) - CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) - CheckSpacing(filename, clean_lines, linenum, nesting_state, error) - CheckOperatorSpacing(filename, clean_lines, linenum, error) - CheckParenthesisSpacing(filename, clean_lines, linenum, error) - CheckCommaSpacing(filename, clean_lines, linenum, error) - CheckBracesSpacing(filename, clean_lines, linenum, error) - CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) - CheckRValueReference(filename, clean_lines, linenum, nesting_state, error) - CheckCheck(filename, clean_lines, linenum, error) - CheckAltTokens(filename, clean_lines, linenum, error) - classinfo = nesting_state.InnermostClass() - if classinfo: - CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + classinfo = nesting_state.InnermostClass() + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for + # section labels, and also lines containing multi-line raw strings. + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(scope_or_label_pattern, cleansed_line) and + not (clean_lines.raw_lines[linenum] != line and + Match(r'^\s*""', line))): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). + cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckTrailingSemicolon(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckOperatorSpacing(filename, clean_lines, linenum, error) + CheckParenthesisSpacing(filename, clean_lines, linenum, error) + CheckCommaSpacing(filename, clean_lines, linenum, error) + CheckBracesSpacing(filename, clean_lines, linenum, error) + CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) + CheckRValueReference(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') @@ -4508,7 +4539,7 @@ _RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') def _DropCommonSuffixes(filename): - """Drops common suffixes like _test.cc or -inl.h from filename. + """Drops common suffixes like _test.cc or -inl.h from filename. For example: >>> _DropCommonSuffixes('foo/foo-inl.h') @@ -4526,16 +4557,16 @@ def _DropCommonSuffixes(filename): Returns: The filename with the common suffix removed. """ - for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', - 'inl.h', 'impl.h', 'internal.h'): - if (filename.endswith(suffix) and len(filename) > len(suffix) and - filename[-len(suffix) - 1] in ('-', '_')): - return filename[:-len(suffix) - 1] - return os.path.splitext(filename)[0] + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', 'inl.h', 'impl.h', + 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] def _IsTestFilename(filename): - """Determines if the given filename has a suffix that identifies it as a test. + """Determines if the given filename has a suffix that identifies it as a test. Args: filename: The input filename. @@ -4543,16 +4574,15 @@ def _IsTestFilename(filename): Returns: True if 'filename' looks like a test, False otherwise. """ - if (filename.endswith('_test.cc') or - filename.endswith('_unittest.cc') or - filename.endswith('_regtest.cc')): - return True - else: - return False + if (filename.endswith('_test.cc') or filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False def _ClassifyInclude(fileinfo, include, is_system): - """Figures out what kind of header 'include' is. + """Figures out what kind of header 'include' is. Args: fileinfo: The current file cpplint is running over. A FileInfo instance. @@ -4575,44 +4605,43 @@ def _ClassifyInclude(fileinfo, include, is_system): >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) _OTHER_HEADER """ - # This is a list of all standard c++ header files, except - # those already checked for above. - is_cpp_h = include in _CPP_HEADERS - - if is_system: - if is_cpp_h: - return _CPP_SYS_HEADER - else: - return _C_SYS_HEADER - - # If the target file and the include we're checking share a - # basename when we drop common extensions, and the include - # lives in . , then it's likely to be owned by the target file. - target_dir, target_base = ( - os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) - include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) - if target_base == include_base and ( - include_dir == target_dir or - include_dir == os.path.normpath(target_dir + '/../public')): - return _LIKELY_MY_HEADER - - # If the target and include share some initial basename - # component, it's possible the target is implementing the - # include, so it's allowed to be first, but we'll never - # complain if it's not there. - target_first_component = _RE_FIRST_COMPONENT.match(target_base) - include_first_component = _RE_FIRST_COMPONENT.match(include_base) - if (target_first_component and include_first_component and - target_first_component.group(0) == - include_first_component.group(0)): - return _POSSIBLE_MY_HEADER - - return _OTHER_HEADER + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): - """Check rules that are applicable to #include lines. + """Check rules that are applicable to #include lines. Strings on #include lines are NOT removed from elided line, to make certain tasks easier. However, to prevent false positives, checks @@ -4625,68 +4654,69 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): include_state: An _IncludeState instance in which the headers are inserted. error: The function to call with any errors found. """ - fileinfo = FileInfo(filename) - line = clean_lines.lines[linenum] - - # "include" should use the new style "foo/bar.h" instead of just "bar.h" - # Only do this check if the included header follows google naming - # conventions. If not, assume that it's a 3rd party API that - # requires special include conventions. - # - # We also make an exception for Lua headers, which follow google - # naming convention but not the include convention. - match = Match(r'#include\s*"([^/]+\.h)"', line) - if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): - error(filename, linenum, 'build/include', 4, - 'Include the directory when naming .h files') - - # we shouldn't include a file more than once. actually, there are a - # handful of instances where doing so is okay, but in general it's - # not. - match = _RE_PATTERN_INCLUDE.search(line) - if match: - include = match.group(2) - is_system = (match.group(1) == '<') - duplicate_line = include_state.FindHeader(include) - if duplicate_line >= 0: - error(filename, linenum, 'build/include', 4, - '"%s" already included at %s:%s' % - (include, filename, duplicate_line)) - elif (include.endswith('.cc') and - os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): - error(filename, linenum, 'build/include', 4, - 'Do not include .cc files from other packages') - elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): - include_state.include_list[-1].append((include, linenum)) - - # We want to ensure that headers appear in the right order: - # 1) for foo.cc, foo.h (preferred location) - # 2) c system files - # 3) cpp system files - # 4) for foo.cc, foo.h (deprecated location) - # 5) other google headers - # - # We classify each include statement as one of those 5 types - # using a number of techniques. The include_state object keeps - # track of the highest type seen, and complains if we see a - # lower type after that. - error_message = include_state.CheckNextIncludeOrder( - _ClassifyInclude(fileinfo, include, is_system)) - if error_message: - error(filename, linenum, 'build/include_order', 4, - '%s. Should be: %s.h, c system, c++ system, other.' % - (error_message, fileinfo.BaseName())) - canonical_include = include_state.CanonicalizeAlphabeticalOrder(include) - if not include_state.IsInAlphabeticalOrder( - clean_lines, linenum, canonical_include): - error(filename, linenum, 'build/include_alpha', 4, - 'Include "%s" not in alphabetical order' % include) - include_state.SetLastHeader(canonical_include) + fileinfo = FileInfo(filename) + line = clean_lines.lines[linenum] + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + # Only do this check if the included header follows google naming + # conventions. If not, assume that it's a 3rd party API that + # requires special include conventions. + # + # We also make an exception for Lua headers, which follow google + # naming convention but not the include convention. + match = Match(r'#include\s*"([^/]+\.h)"', line) + if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + duplicate_line = include_state.FindHeader(include) + if duplicate_line >= 0: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, duplicate_line)) + elif (include.endswith('.cc') and + os.path.dirname(fileinfo.RepositoryName()) != + os.path.dirname(include)): + error(filename, linenum, 'build/include', 4, + 'Do not include .cc files from other packages') + elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): + include_state.include_list[-1].append((include, linenum)) + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' % + (error_message, fileinfo.BaseName())) + canonical_include = include_state.CanonicalizeAlphabeticalOrder( + include) + if not include_state.IsInAlphabeticalOrder(clean_lines, linenum, + canonical_include): + error(filename, linenum, 'build/include_alpha', 4, + 'Include "%s" not in alphabetical order' % include) + include_state.SetLastHeader(canonical_include) def _GetTextInside(text, start_pattern): - r"""Retrieves all the text between matching open and close parentheses. + r"""Retrieves all the text between matching open and close parentheses. Given a string of lines and a regular expression string, retrieve all the text following the expression and between opening punctuation symbols like @@ -4705,40 +4735,40 @@ def _GetTextInside(text, start_pattern): The extracted text. None if either the opening string or ending punctuation could not be found. """ - # TODO(unknown): Audit cpplint.py to see what places could be profitably - # rewritten to use _GetTextInside (and use inferior regexp matching today). - - # Give opening punctuations to get the matching close-punctuations. - matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(matching_punctuation.itervalues()) - - # Find the position to start extracting text. - match = re.search(start_pattern, text, re.M) - if not match: # start_pattern not found in text. - return None - start_position = match.end(0) - - assert start_position > 0, ( - 'start_pattern must ends with an opening punctuation.') - assert text[start_position - 1] in matching_punctuation, ( - 'start_pattern must ends with an opening punctuation.') - # Stack of closing punctuations we expect to have in text after position. - punctuation_stack = [matching_punctuation[text[start_position - 1]]] - position = start_position - while punctuation_stack and position < len(text): - if text[position] == punctuation_stack[-1]: - punctuation_stack.pop() - elif text[position] in closing_punctuation: - # A closing punctuation without matching opening punctuations. - return None - elif text[position] in matching_punctuation: - punctuation_stack.append(matching_punctuation[text[position]]) - position += 1 - if punctuation_stack: - # Opening punctuations left without matching close-punctuations. - return None - # punctuations match. - return text[start_position:position - 1] + # TODO(unknown): Audit cpplint.py to see what places could be profitably + # rewritten to use _GetTextInside (and use inferior regexp matching today). + + # Give opening punctuations to get the matching close-punctuations. + matching_punctuation = {'(': ')', '{': '}', '[': ']'} + closing_punctuation = set(matching_punctuation.itervalues()) + + # Find the position to start extracting text. + match = re.search(start_pattern, text, re.M) + if not match: # start_pattern not found in text. + return None + start_position = match.end(0) + + assert start_position > 0, ( + 'start_pattern must ends with an opening punctuation.') + assert text[start_position - 1] in matching_punctuation, ( + 'start_pattern must ends with an opening punctuation.') + # Stack of closing punctuations we expect to have in text after position. + punctuation_stack = [matching_punctuation[text[start_position - 1]]] + position = start_position + while punctuation_stack and position < len(text): + if text[position] == punctuation_stack[-1]: + punctuation_stack.pop() + elif text[position] in closing_punctuation: + # A closing punctuation without matching opening punctuations. + return None + elif text[position] in matching_punctuation: + punctuation_stack.append(matching_punctuation[text[position]]) + position += 1 + if punctuation_stack: + # Opening punctuations left without matching close-punctuations. + return None + # punctuations match. + return text[start_position:position - 1] # Patterns for matching call-by-reference parameters. @@ -4763,13 +4793,13 @@ _RE_PATTERN_REF_PARAM = re.compile( # A call-by-const-reference parameter either ends with 'const& identifier' # or looks like 'const type& identifier' when 'type' is atomic. _RE_PATTERN_CONST_REF_PARAM = ( - r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + - r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') + r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + r'|const\s+' + + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') -def CheckLanguage(filename, clean_lines, linenum, file_extension, - include_state, nesting_state, error): - """Checks rules from the 'C++ language rules' section of cppguide.html. +def CheckLanguage(filename, clean_lines, linenum, file_extension, include_state, + nesting_state, error): + """Checks rules from the 'C++ language rules' section of cppguide.html. Some of these rules are hard to test (function overloading, using uint32 inappropriately), but we do the best we can. @@ -4784,149 +4814,152 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - # If the line is empty or consists of entirely a comment, no need to - # check it. - line = clean_lines.elided[linenum] - if not line: - return - - match = _RE_PATTERN_INCLUDE.search(line) - if match: - CheckIncludeLine(filename, clean_lines, linenum, include_state, error) - return - - # Reset include state across preprocessor directives. This is meant - # to silence warnings for conditional includes. - match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) - if match: - include_state.ResetSection(match.group(1)) - - # Make Windows paths like Unix. - fullname = os.path.abspath(filename).replace('\\', '/') - - # Perform other checks now that we are sure that this is not an include line - CheckCasts(filename, clean_lines, linenum, error) - CheckGlobalStatic(filename, clean_lines, linenum, error) - CheckPrintf(filename, clean_lines, linenum, error) - - if file_extension == 'h': - # TODO(unknown): check that 1-arg constructors are explicit. - # How to tell it's a constructor? - # (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes declare or disable copy/assign - # (level 1 error) - pass + # If the line is empty or consists of entirely a comment, no need to + # check it. + line = clean_lines.elided[linenum] + if not line: + return - # Check if people are using the verboten C basic types. The only exception - # we regularly allow is "unsigned short port" for port. - if Search(r'\bshort port\b', line): - if not Search(r'\bunsigned short port\b', line): - error(filename, linenum, 'runtime/int', 4, - 'Use "unsigned short" for ports, not "short"') - else: - match = Search(r'\b(short|long(?! +double)|long long)\b', line) + match = _RE_PATTERN_INCLUDE.search(line) if match: - error(filename, linenum, 'runtime/int', 4, - 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - - # Check if some verboten operator overloading is going on - # TODO(unknown): catch out-of-line unary operator&: - # class X {}; - # int operator&(const X& x) { return 42; } // unary operator& - # The trick is it's hard to tell apart from binary operator&: - # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& - if Search(r'\boperator\s*&\s*\(\s*\)', line): - error(filename, linenum, 'runtime/operator', 4, - 'Unary operator& is dangerous. Do not use it.') - - # Check for suspicious usage of "if" like - # } if (a == b) { - if Search(r'\}\s*if\s*\(', line): - error(filename, linenum, 'readability/braces', 4, - 'Did you mean "else if"? If not, start a new line for "if".') - - # Check for potential format string bugs like printf(foo). - # We constrain the pattern not to pick things like DocidForPrintf(foo). - # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(unknown): Catch the following case. Need to change the calling - # convention of the whole function to process multiple line to handle it. - # printf( - # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); - printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') - if printf_args: - match = Match(r'([\w.\->()]+)$', printf_args) - if match and match.group(1) != '__VA_ARGS__': - function_name = re.search(r'\b((?:string)?printf)\s*\(', - line, re.I).group(1) - error(filename, linenum, 'runtime/printf', 4, - 'Potential format string bug. Do %s("%%s", %s) instead.' - % (function_name, match.group(1))) - - # Check for potential memset bugs like memset(buf, sizeof(buf), 0). - match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) - if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): - error(filename, linenum, 'runtime/memset', 4, - 'Did you mean "memset(%s, 0, %s)"?' - % (match.group(1), match.group(2))) - - if Search(r'\busing namespace\b', line): - error(filename, linenum, 'build/namespaces', 5, - 'Do not use namespace using-directives. ' - 'Use using-declarations instead.') - - # Detect variable-length arrays. - match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) - if (match and match.group(2) != 'return' and match.group(2) != 'delete' and - match.group(3).find(']') == -1): - # Split the size using space and arithmetic operators as delimiters. - # If any of the resulting tokens are not compile time constants then - # report the error. - tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) - is_const = True - skip_next = False - for tok in tokens: - if skip_next: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Reset include state across preprocessor directives. This is meant + # to silence warnings for conditional includes. + match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) + if match: + include_state.ResetSection(match.group(1)) + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # Perform other checks now that we are sure that this is not an include line + CheckCasts(filename, clean_lines, linenum, error) + CheckGlobalStatic(filename, clean_lines, linenum, error) + CheckPrintf(filename, clean_lines, linenum, error) + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes declare or disable copy/assign + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % + match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(unknown): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', line, + re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' % + (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' % + (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. + match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. + tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True skip_next = False - continue - - if Search(r'sizeof\(.+\)', tok): continue - if Search(r'arraysize\(\w+\)', tok): continue - - tok = tok.lstrip('(') - tok = tok.rstrip(')') - if not tok: continue - if Match(r'\d+', tok): continue - if Match(r'0[xX][0-9a-fA-F]+', tok): continue - if Match(r'k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue - # A catch all for tricky sizeof cases, including 'sizeof expression', - # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' - # requires skipping the next token because we split on ' ' and '*'. - if tok.startswith('sizeof'): - skip_next = True - continue - is_const = False - break - if not is_const: - error(filename, linenum, 'runtime/arrays', 1, - 'Do not use variable-length arrays. Use an appropriately named ' - "('k' followed by CamelCase) compile-time constant for the size.") - - # Check for use of unnamed namespaces in header files. Registration - # macros are typically OK, so we allow use of "namespace {" on lines - # that end with backslashes. - if (file_extension == 'h' - and Search(r'\bnamespace\s*{', line) - and line[-1] != '\\'): - error(filename, linenum, 'build/namespaces', 4, - 'Do not use unnamed namespaces in header files. See ' - 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' - ' for more information.') + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error( + filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size." + ) + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' and Search(r'\bnamespace\s*{', line) and + line[-1] != '\\'): + error( + filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. See ' + 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + ' for more information.') def CheckGlobalStatic(filename, clean_lines, linenum, error): - """Check for unsafe global or static objects. + """Check for unsafe global or static objects. Args: filename: The name of the current file. @@ -4934,51 +4967,50 @@ def CheckGlobalStatic(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Match two lines at a time to support multiline declarations - if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): - line += clean_lines.elided[linenum + 1].strip() - - # Check for people declaring static/global STL strings at the top level. - # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access. - match = Match( - r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', - line) - - # Remove false positives: - # - String pointers (as opposed to values). - # string *pointer - # const string *pointer - # string const *pointer - # string *const pointer - # - # - Functions and template specializations. - # string Function(... - # string Class::Method(... - # - # - Operators. These are matched separately because operator names - # cross non-word boundaries, and trying to match both operators - # and functions at the same time would decrease accuracy of - # matching identifiers. - # string Class::operator*() - if (match and - not Search(r'\bstring\b(\s+const)?\s*\*\s*(const\s+)?\w', line) and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(3))): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string instead: ' - '"%schar %s[]".' % - (match.group(1), match.group(2))) - - if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') + line = clean_lines.elided[linenum] + + # Match two lines at a time to support multiline declarations + if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): + line += clean_lines.elided[linenum + 1].strip() + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match(r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + + # Remove false positives: + # - String pointers (as opposed to values). + # string *pointer + # const string *pointer + # string const *pointer + # string *const pointer + # + # - Functions and template specializations. + # string Function(... + # string Class::Method(... + # + # - Operators. These are matched separately because operator names + # cross non-word boundaries, and trying to match both operators + # and functions at the same time would decrease accuracy of + # matching identifiers. + # string Class::operator*() + if (match and + not Search(r'\bstring\b(\s+const)?\s*\*\s*(const\s+)?\w', line) and + not Search(r'\boperator\W', line) and not Match( + r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(3))): + error( + filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') def CheckPrintf(filename, clean_lines, linenum, error): - """Check for printf related issues. + """Check for printf related issues. Args: filename: The name of the current file. @@ -4986,28 +5018,28 @@ def CheckPrintf(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # When snprintf is used, the second argument shouldn't be a literal. - match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\s*\(', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\s*\(', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) + line = clean_lines.elided[linenum] + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\s*\(', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\s*\(', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) def IsDerivedFunction(clean_lines, linenum): - """Check if current line contains an inherited function. + """Check if current line contains an inherited function. Args: clean_lines: A CleansedLines instance containing the file. @@ -5016,20 +5048,20 @@ def IsDerivedFunction(clean_lines, linenum): True if current line contains a function with "override" virt-specifier. """ - # Scan back a few lines for start of current function - for i in xrange(linenum, max(-1, linenum - 10), -1): - match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) - if match: - # Look for "override" after the matching closing parenthesis - line, _, closing_paren = CloseExpression( - clean_lines, i, len(match.group(1))) - return (closing_paren >= 0 and - Search(r'\boverride\b', line[closing_paren:])) - return False + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) + if match: + # Look for "override" after the matching closing parenthesis + line, _, closing_paren = CloseExpression(clean_lines, i, + len(match.group(1))) + return (closing_paren >= 0 and + Search(r'\boverride\b', line[closing_paren:])) + return False def IsOutOfLineMethodDefinition(clean_lines, linenum): - """Check if current line contains an out-of-line method definition. + """Check if current line contains an out-of-line method definition. Args: clean_lines: A CleansedLines instance containing the file. @@ -5037,15 +5069,16 @@ def IsOutOfLineMethodDefinition(clean_lines, linenum): Returns: True if current line contains an out-of-line method definition. """ - # Scan back a few lines for start of current function - for i in xrange(linenum, max(-1, linenum - 10), -1): - if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): - return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None - return False + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): + return Match(r'^[^()]*\w+::\w+\(', + clean_lines.elided[i]) is not None + return False def IsInitializerList(clean_lines, linenum): - """Check if current line is inside constructor initializer list. + """Check if current line is inside constructor initializer list. Args: clean_lines: A CleansedLines instance containing the file. @@ -5054,41 +5087,41 @@ def IsInitializerList(clean_lines, linenum): True if current line appears to be inside constructor initializer list, False otherwise. """ - for i in xrange(linenum, 1, -1): - line = clean_lines.elided[i] - if i == linenum: - remove_function_body = Match(r'^(.*)\{\s*$', line) - if remove_function_body: - line = remove_function_body.group(1) - - if Search(r'\s:\s*\w+[({]', line): - # A lone colon tend to indicate the start of a constructor - # initializer list. It could also be a ternary operator, which - # also tend to appear in constructor initializer lists as - # opposed to parameter lists. - return True - if Search(r'\}\s*,\s*$', line): - # A closing brace followed by a comma is probably the end of a - # brace-initialized member in constructor initializer list. - return True - if Search(r'[{};]\s*$', line): - # Found one of the following: - # - A closing brace or semicolon, probably the end of the previous - # function. - # - An opening brace, probably the start of current class or namespace. - # - # Current line is probably not inside an initializer list since - # we saw one of those things without seeing the starting colon. - return False - - # Got to the beginning of the file without seeing the start of - # constructor initializer list. - return False - - -def CheckForNonConstReference(filename, clean_lines, linenum, - nesting_state, error): - """Check for non-const references. + for i in xrange(linenum, 1, -1): + line = clean_lines.elided[i] + if i == linenum: + remove_function_body = Match(r'^(.*)\{\s*$', line) + if remove_function_body: + line = remove_function_body.group(1) + + if Search(r'\s:\s*\w+[({]', line): + # A lone colon tend to indicate the start of a constructor + # initializer list. It could also be a ternary operator, which + # also tend to appear in constructor initializer lists as + # opposed to parameter lists. + return True + if Search(r'\}\s*,\s*$', line): + # A closing brace followed by a comma is probably the end of a + # brace-initialized member in constructor initializer list. + return True + if Search(r'[{};]\s*$', line): + # Found one of the following: + # - A closing brace or semicolon, probably the end of the previous + # function. + # - An opening brace, probably the start of current class or namespace. + # + # Current line is probably not inside an initializer list since + # we saw one of those things without seeing the starting colon. + return False + + # Got to the beginning of the file without seeing the start of + # constructor initializer list. + return False + + +def CheckForNonConstReference(filename, clean_lines, linenum, nesting_state, + error): + """Check for non-const references. Separate from CheckLanguage since it scans backwards from current line, instead of scanning forward. @@ -5101,131 +5134,131 @@ def CheckForNonConstReference(filename, clean_lines, linenum, the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - # Do nothing if there is no '&' on current line. - line = clean_lines.elided[linenum] - if '&' not in line: - return - - # If a function is inherited, current function doesn't have much of - # a choice, so any non-const references should not be blamed on - # derived function. - if IsDerivedFunction(clean_lines, linenum): - return - - # Don't warn on out-of-line method definitions, as we would warn on the - # in-line declaration, if it isn't marked with 'override'. - if IsOutOfLineMethodDefinition(clean_lines, linenum): - return - - # Long type names may be broken across multiple lines, usually in one - # of these forms: - # LongType - # ::LongTypeContinued &identifier - # LongType:: - # LongTypeContinued &identifier - # LongType< - # ...>::LongTypeContinued &identifier - # - # If we detected a type split across two lines, join the previous - # line to current line so that we can match const references - # accordingly. - # - # Note that this only scans back one line, since scanning back - # arbitrary number of lines would be expensive. If you have a type - # that spans more than 2 lines, please use a typedef. - if linenum > 1: - previous = None - if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): - # previous_line\n + ::current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', - clean_lines.elided[linenum - 1]) - elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): - # previous_line::\n + current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', - clean_lines.elided[linenum - 1]) - if previous: - line = previous.group(1) + line.lstrip() - else: - # Check for templated parameter that is split across multiple lines - endpos = line.rfind('>') - if endpos > -1: - (_, startline, startpos) = ReverseCloseExpression( - clean_lines, linenum, endpos) - if startpos > -1 and startline < linenum: - # Found the matching < on an earlier line, collect all - # pieces up to current line. - line = '' - for i in xrange(startline, linenum + 1): - line += clean_lines.elided[i].strip() - - # Check for non-const references in function parameters. A single '&' may - # found in the following places: - # inside expression: binary & for bitwise AND - # inside expression: unary & for taking the address of something - # inside declarators: reference parameter - # We will exclude the first two cases by checking that we are not inside a - # function body, including one that was just introduced by a trailing '{'. - # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. - if (nesting_state.previous_stack_top and - not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or - isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): - # Not at toplevel, not within a class, and not within a namespace - return - - # Avoid initializer lists. We only need to scan back from the - # current line for something that starts with ':'. - # - # We don't need to check the current line, since the '&' would - # appear inside the second set of parentheses on the current line as - # opposed to the first set. - if linenum > 0: - for i in xrange(linenum - 1, max(0, linenum - 10), -1): - previous_line = clean_lines.elided[i] - if not Search(r'[),]\s*$', previous_line): - break - if Match(r'^\s*:\s+\S', previous_line): + # Do nothing if there is no '&' on current line. + line = clean_lines.elided[linenum] + if '&' not in line: + return + + # If a function is inherited, current function doesn't have much of + # a choice, so any non-const references should not be blamed on + # derived function. + if IsDerivedFunction(clean_lines, linenum): + return + + # Don't warn on out-of-line method definitions, as we would warn on the + # in-line declaration, if it isn't marked with 'override'. + if IsOutOfLineMethodDefinition(clean_lines, linenum): + return + + # Long type names may be broken across multiple lines, usually in one + # of these forms: + # LongType + # ::LongTypeContinued &identifier + # LongType:: + # LongTypeContinued &identifier + # LongType< + # ...>::LongTypeContinued &identifier + # + # If we detected a type split across two lines, join the previous + # line to current line so that we can match const references + # accordingly. + # + # Note that this only scans back one line, since scanning back + # arbitrary number of lines would be expensive. If you have a type + # that spans more than 2 lines, please use a typedef. + if linenum > 1: + previous = None + if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): + # previous_line\n + ::current_line + previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', + clean_lines.elided[linenum - 1]) + elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): + # previous_line::\n + current_line + previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', + clean_lines.elided[linenum - 1]) + if previous: + line = previous.group(1) + line.lstrip() + else: + # Check for templated parameter that is split across multiple lines + endpos = line.rfind('>') + if endpos > -1: + (_, startline, startpos) = ReverseCloseExpression( + clean_lines, linenum, endpos) + if startpos > -1 and startline < linenum: + # Found the matching < on an earlier line, collect all + # pieces up to current line. + line = '' + for i in xrange(startline, linenum + 1): + line += clean_lines.elided[i].strip() + + # Check for non-const references in function parameters. A single '&' may + # found in the following places: + # inside expression: binary & for bitwise AND + # inside expression: unary & for taking the address of something + # inside declarators: reference parameter + # We will exclude the first two cases by checking that we are not inside a + # function body, including one that was just introduced by a trailing '{'. + # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. + if (nesting_state.previous_stack_top and + not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or + isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): + # Not at toplevel, not within a class, and not within a namespace + return + + # Avoid initializer lists. We only need to scan back from the + # current line for something that starts with ':'. + # + # We don't need to check the current line, since the '&' would + # appear inside the second set of parentheses on the current line as + # opposed to the first set. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 10), -1): + previous_line = clean_lines.elided[i] + if not Search(r'[),]\s*$', previous_line): + break + if Match(r'^\s*:\s+\S', previous_line): + return + + # Avoid preprocessors + if Search(r'\\\s*$', line): return - # Avoid preprocessors - if Search(r'\\\s*$', line): - return - - # Avoid constructor initializer lists - if IsInitializerList(clean_lines, linenum): - return - - # We allow non-const references in a few standard places, like functions - # called "swap()" or iostream operators like "<<" or ">>". Do not check - # those function parameters. - # - # We also accept & in static_assert, which looks like a function but - # it's actually a declaration expression. - whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' - r'operator\s*[<>][<>]|' - r'static_assert|COMPILE_ASSERT' - r')\s*\(') - if Search(whitelisted_functions, line): - return - elif not Search(r'\S+\([^)]*$', line): - # Don't see a whitelisted function on this line. Actually we - # didn't see any function name on this line, so this is likely a - # multi-line parameter list. Try a bit harder to catch this case. - for i in xrange(2): - if (linenum > i and - Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): + # Avoid constructor initializer lists + if IsInitializerList(clean_lines, linenum): return - decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body - for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): - if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): - error(filename, linenum, 'runtime/references', 2, - 'Is this a non-const reference? ' - 'If so, make const or use a pointer: ' + - ReplaceAll(' *<', '<', parameter)) + # We allow non-const references in a few standard places, like functions + # called "swap()" or iostream operators like "<<" or ">>". Do not check + # those function parameters. + # + # We also accept & in static_assert, which looks like a function but + # it's actually a declaration expression. + whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') + if Search(whitelisted_functions, line): + return + elif not Search(r'\S+\([^)]*$', line): + # Don't see a whitelisted function on this line. Actually we + # didn't see any function name on this line, so this is likely a + # multi-line parameter list. Try a bit harder to catch this case. + for i in xrange(2): + if (linenum > i and Search(whitelisted_functions, + clean_lines.elided[linenum - i - 1])): + return + + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + ReplaceAll( + ' *<', '<', parameter)) def CheckCasts(filename, clean_lines, linenum, error): - """Various cast related checks. + """Various cast related checks. Args: filename: The name of the current file. @@ -5233,118 +5266,116 @@ def CheckCasts(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Check to see if they're using an conversion function cast. - # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. - match = Search( - r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b' - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - expecting_function = ExpectingFunctionArgs(clean_lines, linenum) - if match and not expecting_function: - matched_type = match.group(2) - - # matched_new_or_template is used to silence two false positives: - # - New operators - # - Template arguments with function types + line = clean_lines.elided[linenum] + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search(r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b' + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + expecting_function = ExpectingFunctionArgs(clean_lines, linenum) + if match and not expecting_function: + matched_type = match.group(2) + + # matched_new_or_template is used to silence two false positives: + # - New operators + # - Template arguments with function types + # + # For template arguments, we match on types immediately following + # an opening bracket without any spaces. This is a fast way to + # silence the common case where the function type is the first + # template argument. False negative with less-than comparison is + # avoided because those operators are usually followed by a space. + # + # function // bracket + no space = false positive + # value < double(42) // bracket + space = true positive + matched_new_or_template = match.group(1) + + # Avoid arrays by looking for brackets that come after the closing + # parenthesis. + if Match(r'\([^()]+\)\s*\[', match.group(3)): + return + + # Other things to ignore: + # - Function pointers + # - Casts to pointer types + # - Placement new + # - Alias declarations + matched_funcptr = match.group(3) + if (matched_new_or_template is None and not (matched_funcptr and (Match( + r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr) or matched_funcptr.startswith('(*)'))) and + not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and + not Search(r'new\(\S+\)\s*' + matched_type, line)): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % matched_type) + + if not expecting_function: + CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', + error) + + # This doesn't catch all cases. Consider (const char * const)"hello". # - # For template arguments, we match on types immediately following - # an opening bracket without any spaces. This is a fast way to - # silence the common case where the function type is the first - # template argument. False negative with less-than comparison is - # avoided because those operators are usually followed by a space. + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', + r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', + r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. # - # function // bracket + no space = false positive - # value < double(42) // bracket + space = true positive - matched_new_or_template = match.group(1) - - # Avoid arrays by looking for brackets that come after the closing - # parenthesis. - if Match(r'\([^()]+\)\s*\[', match.group(3)): - return - - # Other things to ignore: - # - Function pointers - # - Casts to pointer types - # - Placement new - # - Alias declarations - matched_funcptr = match.group(3) - if (matched_new_or_template is None and - not (matched_funcptr and - (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr) or - matched_funcptr.startswith('(*)'))) and - not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and - not Search(r'new\(\S+\)\s*' + matched_type, line)): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - if not expecting_function: - CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". - # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', - r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', - r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - # - # Some non-identifier character is required before the '&' for the - # expression to be recognized as a cast. These are casts: - # expression = &static_cast(temporary()); - # function(&(int*)(temporary())); - # - # This is not a cast: - # reference_type&(int* function_param); - match = Search( - r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' - r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) - if match: - # Try a better error message when the & is bound to something - # dereferenced by the casted pointer, as opposed to the casted - # pointer itself. - parenthesis_error = False - match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line) + # Some non-identifier character is required before the '&' for the + # expression to be recognized as a cast. These are casts: + # expression = &static_cast(temporary()); + # function(&(int*)(temporary())); + # + # This is not a cast: + # reference_type&(int* function_param); + match = Search(r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' + r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) if match: - _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1))) - if x1 >= 0 and clean_lines.elided[y1][x1] == '(': - _, y2, x2 = CloseExpression(clean_lines, y1, x1) - if x2 >= 0: - extended_line = clean_lines.elided[y2][x2:] - if y2 < clean_lines.NumLines() - 1: - extended_line += clean_lines.elided[y2 + 1] - if Match(r'\s*(?:->|\[)', extended_line): - parenthesis_error = True - - if parenthesis_error: - error(filename, linenum, 'readability/casting', 4, - ('Are you taking an address of something dereferenced ' - 'from a cast? Wrapping the dereferenced expression in ' - 'parentheses will make the binding more obvious')) - else: - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? ' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) + # Try a better error message when the & is bound to something + # dereferenced by the casted pointer, as opposed to the casted + # pointer itself. + parenthesis_error = False + match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', + line) + if match: + _, y1, x1 = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if x1 >= 0 and clean_lines.elided[y1][x1] == '(': + _, y2, x2 = CloseExpression(clean_lines, y1, x1) + if x2 >= 0: + extended_line = clean_lines.elided[y2][x2:] + if y2 < clean_lines.NumLines() - 1: + extended_line += clean_lines.elided[y2 + 1] + if Match(r'\s*(?:->|\[)', extended_line): + parenthesis_error = True + + if parenthesis_error: + error(filename, linenum, 'readability/casting', 4, + ('Are you taking an address of something dereferenced ' + 'from a cast? Wrapping the dereferenced expression in ' + 'parentheses will make the binding more obvious')) + else: + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): - """Checks for a C-style cast by looking for the pattern. + """Checks for a C-style cast by looking for the pattern. Args: filename: The name of the current file. @@ -5359,96 +5390,96 @@ def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): True if an error was emitted. False otherwise. """ - line = clean_lines.elided[linenum] - match = Search(pattern, line) - if not match: - return False + line = clean_lines.elided[linenum] + match = Search(pattern, line) + if not match: + return False - # Exclude lines with keywords that tend to look like casts - context = line[0:match.start(1) - 1] - if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): - return False + # Exclude lines with keywords that tend to look like casts + context = line[0:match.start(1) - 1] + if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): + return False - # Try expanding current context to see if we one level of - # parentheses inside a macro. - if linenum > 0: - for i in xrange(linenum - 1, max(0, linenum - 5), -1): - context = clean_lines.elided[i] + context - if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): - return False + # Try expanding current context to see if we one level of + # parentheses inside a macro. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 5), -1): + context = clean_lines.elided[i] + context + if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): + return False - # operator++(int) and operator--(int) - if context.endswith(' operator++') or context.endswith(' operator--'): - return False + # operator++(int) and operator--(int) + if context.endswith(' operator++') or context.endswith(' operator--'): + return False - # A single unnamed argument for a function tends to look like old - # style cast. If we see those, don't issue warnings for deprecated - # casts, instead issue warnings for unnamed arguments where - # appropriate. - # - # These are things that we want warnings for, since the style guide - # explicitly require all parameters to be named: - # Function(int); - # Function(int) { - # ConstMember(int) const; - # ConstMember(int) const { - # ExceptionMember(int) throw (...); - # ExceptionMember(int) throw (...) { - # PureVirtual(int) = 0; - # [](int) -> bool { - # - # These are functions of some sort, where the compiler would be fine - # if they had named parameters, but people often omit those - # identifiers to reduce clutter: - # (FunctionPointer)(int); - # (FunctionPointer)(int) = value; - # Function((function_pointer_arg)(int)) - # Function((function_pointer_arg)(int), int param) - # ; - # <(FunctionPointerTemplateArgument)(int)>; - remainder = line[match.end(0):] - if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', - remainder): - # Looks like an unnamed parameter. - - # Don't warn on any kind of template arguments. - if Match(r'^\s*>', remainder): - return False - - # Don't warn on assignments to function pointers, but keep warnings for - # unnamed parameters to pure virtual functions. Note that this pattern - # will also pass on assignments of "0" to function pointers, but the - # preferred values for those would be "nullptr" or "NULL". - matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) - if matched_zero and matched_zero.group(1) != '0': - return False - - # Don't warn on function pointer declarations. For this we need - # to check what came before the "(type)" string. - if Match(r'.*\)\s*$', line[0:match.start(0)]): - return False - - # Don't warn if the parameter is named with block comments, e.g.: - # Function(int /*unused_param*/); - raw_line = clean_lines.raw_lines[linenum] - if '/*' in raw_line: - return False - - # Passed all filters, issue warning here. - error(filename, linenum, 'readability/function', 3, - 'All parameters should be named in a function') - return True + # A single unnamed argument for a function tends to look like old + # style cast. If we see those, don't issue warnings for deprecated + # casts, instead issue warnings for unnamed arguments where + # appropriate. + # + # These are things that we want warnings for, since the style guide + # explicitly require all parameters to be named: + # Function(int); + # Function(int) { + # ConstMember(int) const; + # ConstMember(int) const { + # ExceptionMember(int) throw (...); + # ExceptionMember(int) throw (...) { + # PureVirtual(int) = 0; + # [](int) -> bool { + # + # These are functions of some sort, where the compiler would be fine + # if they had named parameters, but people often omit those + # identifiers to reduce clutter: + # (FunctionPointer)(int); + # (FunctionPointer)(int) = value; + # Function((function_pointer_arg)(int)) + # Function((function_pointer_arg)(int), int param) + # ; + # <(FunctionPointerTemplateArgument)(int)>; + remainder = line[match.end(0):] + if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', + remainder): + # Looks like an unnamed parameter. + + # Don't warn on any kind of template arguments. + if Match(r'^\s*>', remainder): + return False - # At this point, all that should be left is actual casts. - error(filename, linenum, 'readability/casting', 4, - 'Using C-style cast. Use %s<%s>(...) instead' % - (cast_type, match.group(1))) + # Don't warn on assignments to function pointers, but keep warnings for + # unnamed parameters to pure virtual functions. Note that this pattern + # will also pass on assignments of "0" to function pointers, but the + # preferred values for those would be "nullptr" or "NULL". + matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) + if matched_zero and matched_zero.group(1) != '0': + return False - return True + # Don't warn on function pointer declarations. For this we need + # to check what came before the "(type)" string. + if Match(r'.*\)\s*$', line[0:match.start(0)]): + return False + + # Don't warn if the parameter is named with block comments, e.g.: + # Function(int /*unused_param*/); + raw_line = clean_lines.raw_lines[linenum] + if '/*' in raw_line: + return False + + # Passed all filters, issue warning here. + error(filename, linenum, 'readability/function', 3, + 'All parameters should be named in a function') + return True + + # At this point, all that should be left is actual casts. + error(filename, linenum, 'readability/casting', 4, + 'Using C-style cast. Use %s<%s>(...) instead' % + (cast_type, match.group(1))) + + return True def ExpectingFunctionArgs(clean_lines, linenum): - """Checks whether where function type arguments are expected. + """Checks whether where function type arguments are expected. Args: clean_lines: A CleansedLines instance containing the file. @@ -5458,78 +5489,107 @@ def ExpectingFunctionArgs(clean_lines, linenum): True if the line at 'linenum' is inside something that expects arguments of function types. """ - line = clean_lines.elided[linenum] - return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or - (linenum >= 2 and - (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', - clean_lines.elided[linenum - 1]) or - Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', - clean_lines.elided[linenum - 2]) or - Search(r'\bstd::m?function\s*\<\s*$', - clean_lines.elided[linenum - 1])))) + line = clean_lines.elided[linenum] + return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + (linenum >= 2 and + (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]) or + Search(r'\bstd::m?function\s*\<\s*$', + clean_lines.elided[linenum - 1])))) _HEADERS_CONTAINING_TEMPLATES = ( - ('', ('deque',)), - ('', ('unary_function', 'binary_function', - 'plus', 'minus', 'multiplies', 'divides', 'modulus', - 'negate', - 'equal_to', 'not_equal_to', 'greater', 'less', - 'greater_equal', 'less_equal', - 'logical_and', 'logical_or', 'logical_not', - 'unary_negate', 'not1', 'binary_negate', 'not2', - 'bind1st', 'bind2nd', - 'pointer_to_unary_function', - 'pointer_to_binary_function', - 'ptr_fun', - 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t', - 'mem_fun_ref_t', - 'const_mem_fun_t', 'const_mem_fun1_t', - 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t', - 'mem_fun_ref', - )), - ('', ('numeric_limits',)), - ('', ('list',)), - ('', ('map', 'multimap',)), - ('', ('allocator',)), - ('', ('queue', 'priority_queue',)), - ('', ('set', 'multiset',)), - ('', ('stack',)), - ('', ('char_traits', 'basic_string',)), - ('', ('tuple',)), - ('', ('pair',)), - ('', ('vector',)), + ('', ('deque', )), + ('', ( + 'unary_function', + 'binary_function', + 'plus', + 'minus', + 'multiplies', + 'divides', + 'modulus', + 'negate', + 'equal_to', + 'not_equal_to', + 'greater', + 'less', + 'greater_equal', + 'less_equal', + 'logical_and', + 'logical_or', + 'logical_not', + 'unary_negate', + 'not1', + 'binary_negate', + 'not2', + 'bind1st', + 'bind2nd', + 'pointer_to_unary_function', + 'pointer_to_binary_function', + 'ptr_fun', + 'mem_fun_t', + 'mem_fun', + 'mem_fun1_t', + 'mem_fun1_ref_t', + 'mem_fun_ref_t', + 'const_mem_fun_t', + 'const_mem_fun1_t', + 'const_mem_fun_ref_t', + 'const_mem_fun1_ref_t', + 'mem_fun_ref', )), + ('', ('numeric_limits', )), + ('', ('list', )), + ('', ( + 'map', + 'multimap', )), + ('', ('allocator', )), + ('', ( + 'queue', + 'priority_queue', )), + ('', ( + 'set', + 'multiset', )), + ('', ('stack', )), + ('', ( + 'char_traits', + 'basic_string', )), + ('', ('tuple', )), + ('', ('pair', )), + ('', ('vector', )), # gcc extensions. # Note: std::hash is their hash, ::hash is our hash - ('', ('hash_map', 'hash_multimap',)), - ('', ('hash_set', 'hash_multiset',)), - ('', ('slist',)), - ) + ('', ( + 'hash_map', + 'hash_multimap', )), + ('', ( + 'hash_set', + 'hash_multiset', )), + ('', ('slist', )), ) _RE_PATTERN_STRING = re.compile(r'\bstring\b') _re_pattern_algorithm_header = [] for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', 'transform'): - # Match max(..., ...), max(..., ...), but not foo->max, foo.max or - # type::max(). - _re_pattern_algorithm_header.append( - (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), - _template, - '')) + # Match max(..., ...), max(..., ...), but not foo->max, foo.max or + # type::max(). + _re_pattern_algorithm_header.append( + (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), _template, + '')) _re_pattern_templates = [] for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: - for _template in _templates: - _re_pattern_templates.append( - (re.compile(r'(\<|\b)' + _template + r'\s*\<'), - _template + '<>', - _header)) + for _template in _templates: + _re_pattern_templates.append( + (re.compile(r'(\<|\b)' + _template + r'\s*\<'), _template + '<>', + _header)) def FilesBelongToSameModule(filename_cc, filename_h): - """Check if these two filenames belong to the same module. + """Check if these two filenames belong to the same module. The concept of a 'module' here is a as follows: foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the @@ -5558,33 +5618,33 @@ def FilesBelongToSameModule(filename_cc, filename_h): string: the additional prefix needed to open the header file. """ - if not filename_cc.endswith('.cc'): - return (False, '') - filename_cc = filename_cc[:-len('.cc')] - if filename_cc.endswith('_unittest'): - filename_cc = filename_cc[:-len('_unittest')] - elif filename_cc.endswith('_test'): - filename_cc = filename_cc[:-len('_test')] - filename_cc = filename_cc.replace('/public/', '/') - filename_cc = filename_cc.replace('/internal/', '/') - - if not filename_h.endswith('.h'): - return (False, '') - filename_h = filename_h[:-len('.h')] - if filename_h.endswith('-inl'): - filename_h = filename_h[:-len('-inl')] - filename_h = filename_h.replace('/public/', '/') - filename_h = filename_h.replace('/internal/', '/') - - files_belong_to_same_module = filename_cc.endswith(filename_h) - common_path = '' - if files_belong_to_same_module: - common_path = filename_cc[:-len(filename_h)] - return files_belong_to_same_module, common_path + if not filename_cc.endswith('.cc'): + return (False, '') + filename_cc = filename_cc[:-len('.cc')] + if filename_cc.endswith('_unittest'): + filename_cc = filename_cc[:-len('_unittest')] + elif filename_cc.endswith('_test'): + filename_cc = filename_cc[:-len('_test')] + filename_cc = filename_cc.replace('/public/', '/') + filename_cc = filename_cc.replace('/internal/', '/') + + if not filename_h.endswith('.h'): + return (False, '') + filename_h = filename_h[:-len('.h')] + if filename_h.endswith('-inl'): + filename_h = filename_h[:-len('-inl')] + filename_h = filename_h.replace('/public/', '/') + filename_h = filename_h.replace('/internal/', '/') + + files_belong_to_same_module = filename_cc.endswith(filename_h) + common_path = '' + if files_belong_to_same_module: + common_path = filename_cc[:-len(filename_h)] + return files_belong_to_same_module, common_path def UpdateIncludeState(filename, include_dict, io=codecs): - """Fill up the include_dict with new includes found from the file. + """Fill up the include_dict with new includes found from the file. Args: filename: the name of the header to read. @@ -5594,25 +5654,28 @@ def UpdateIncludeState(filename, include_dict, io=codecs): Returns: True if a header was successfully added. False otherwise. """ - headerfile = None - try: - headerfile = io.open(filename, 'r', 'utf8', 'replace') - except IOError: - return False - linenum = 0 - for line in headerfile: - linenum += 1 - clean_line = CleanseComments(line) - match = _RE_PATTERN_INCLUDE.search(clean_line) - if match: - include = match.group(2) - include_dict.setdefault(include, linenum) - return True + headerfile = None + try: + headerfile = io.open(filename, 'r', 'utf8', 'replace') + except IOError: + return False + linenum = 0 + for line in headerfile: + linenum += 1 + clean_line = CleanseComments(line) + match = _RE_PATTERN_INCLUDE.search(clean_line) + if match: + include = match.group(2) + include_dict.setdefault(include, linenum) + return True -def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, +def CheckForIncludeWhatYouUse(filename, + clean_lines, + include_state, + error, io=codecs): - """Reports for missing stl includes. + """Reports for missing stl includes. This function will output warnings to make sure you are including the headers necessary for the stl containers and functions that you use. We only give one @@ -5628,87 +5691,88 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, io: The IO factory to use to read the header file. Provided for unittest injection. """ - required = {} # A map of header name to linenumber and the template entity. - # Example of required: { '': (1219, 'less<>') } + required = {} # A map of header name to linenumber and the template entity. + # Example of required: { '': (1219, 'less<>') } - for linenum in xrange(clean_lines.NumLines()): - line = clean_lines.elided[linenum] - if not line or line[0] == '#': - continue + for linenum in xrange(clean_lines.NumLines()): + line = clean_lines.elided[linenum] + if not line or line[0] == '#': + continue - # String is special -- it is a non-templatized type in STL. - matched = _RE_PATTERN_STRING.search(line) - if matched: - # Don't warn about strings in non-STL namespaces: - # (We check only the first match per line; good enough.) - prefix = line[:matched.start()] - if prefix.endswith('std::') or not prefix.endswith('::'): - required[''] = (linenum, 'string') - - for pattern, template, header in _re_pattern_algorithm_header: - if pattern.search(line): - required[header] = (linenum, template) - - # The following function is just a speed up, no semantics are changed. - if not '<' in line: # Reduces the cpu time usage by skipping lines. - continue - - for pattern, template, header in _re_pattern_templates: - if pattern.search(line): - required[header] = (linenum, template) - - # The policy is that if you #include something in foo.h you don't need to - # include it again in foo.cc. Here, we will look at possible includes. - # Let's flatten the include_state include_list and copy it into a dictionary. - include_dict = dict([item for sublist in include_state.include_list - for item in sublist]) - - # Did we find the header for this file (if any) and successfully load it? - header_found = False - - # Use the absolute path so that matching works properly. - abs_filename = FileInfo(filename).FullName() - - # For Emacs's flymake. - # If cpplint is invoked from Emacs's flymake, a temporary file is generated - # by flymake and that file name might end with '_flymake.cc'. In that case, - # restore original file name here so that the corresponding header file can be - # found. - # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h' - # instead of 'foo_flymake.h' - abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) - - # include_dict is modified during iteration, so we iterate over a copy of - # the keys. - header_keys = include_dict.keys() - for header in header_keys: - (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) - fullpath = common_path + header - if same_module and UpdateIncludeState(fullpath, include_dict, io): - header_found = True - - # If we can't find the header file for a .cc, assume it's because we don't - # know where to look. In that case we'll give up as we're not sure they - # didn't include it in the .h file. - # TODO(unknown): Do a better job of finding .h files so we are confident that - # not having the .h file means there isn't one. - if filename.endswith('.cc') and not header_found: - return - - # All the lines have been processed, report the errors found. - for required_header_unstripped in required: - template = required[required_header_unstripped][1] - if required_header_unstripped.strip('<>"') not in include_dict: - error(filename, required[required_header_unstripped][0], - 'build/include_what_you_use', 4, - 'Add #include ' + required_header_unstripped + ' for ' + template) + # String is special -- it is a non-templatized type in STL. + matched = _RE_PATTERN_STRING.search(line) + if matched: + # Don't warn about strings in non-STL namespaces: + # (We check only the first match per line; good enough.) + prefix = line[:matched.start()] + if prefix.endswith('std::') or not prefix.endswith('::'): + required[''] = (linenum, 'string') + + for pattern, template, header in _re_pattern_algorithm_header: + if pattern.search(line): + required[header] = (linenum, template) + + # The following function is just a speed up, no semantics are changed. + if not '<' in line: # Reduces the cpu time usage by skipping lines. + continue + + for pattern, template, header in _re_pattern_templates: + if pattern.search(line): + required[header] = (linenum, template) + + # The policy is that if you #include something in foo.h you don't need to + # include it again in foo.cc. Here, we will look at possible includes. + # Let's flatten the include_state include_list and copy it into a dictionary. + include_dict = dict( + [item for sublist in include_state.include_list for item in sublist]) + + # Did we find the header for this file (if any) and successfully load it? + header_found = False + + # Use the absolute path so that matching works properly. + abs_filename = FileInfo(filename).FullName() + + # For Emacs's flymake. + # If cpplint is invoked from Emacs's flymake, a temporary file is generated + # by flymake and that file name might end with '_flymake.cc'. In that case, + # restore original file name here so that the corresponding header file can be + # found. + # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h' + # instead of 'foo_flymake.h' + abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) + + # include_dict is modified during iteration, so we iterate over a copy of + # the keys. + header_keys = include_dict.keys() + for header in header_keys: + (same_module, common_path) = FilesBelongToSameModule(abs_filename, + header) + fullpath = common_path + header + if same_module and UpdateIncludeState(fullpath, include_dict, io): + header_found = True + + # If we can't find the header file for a .cc, assume it's because we don't + # know where to look. In that case we'll give up as we're not sure they + # didn't include it in the .h file. + # TODO(unknown): Do a better job of finding .h files so we are confident that + # not having the .h file means there isn't one. + if filename.endswith('.cc') and not header_found: + return + + # All the lines have been processed, report the errors found. + for required_header_unstripped in required: + template = required[required_header_unstripped][1] + if required_header_unstripped.strip('<>"') not in include_dict: + error(filename, required[required_header_unstripped][0], + 'build/include_what_you_use', 4, 'Add #include ' + + required_header_unstripped + ' for ' + template) _RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): - """Check that make_pair's template arguments are deduced. + """Check that make_pair's template arguments are deduced. G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are specified explicitly, and such use isn't intended in any case. @@ -5719,17 +5783,20 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) - if match: - error(filename, linenum, 'build/explicit_make_pair', - 4, # 4 = high confidence - 'For C++11-compatibility, omit template arguments from make_pair' - ' OR use pair directly OR if appropriate, construct a pair directly') + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error( + filename, + linenum, + 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') def CheckDefaultLambdaCaptures(filename, clean_lines, linenum, error): - """Check that default lambda captures are not used. + """Check that default lambda captures are not used. Args: filename: The name of the current file. @@ -5737,24 +5804,28 @@ def CheckDefaultLambdaCaptures(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # A lambda introducer specifies a default capture if it starts with "[=" - # or if it starts with "[&" _not_ followed by an identifier. - match = Match(r'^(.*)\[\s*(?:=|&[^\w])', line) - if match: - # Found a potential error, check what comes after the lambda-introducer. - # If it's not open parenthesis (for lambda-declarator) or open brace - # (for compound-statement), it's not a lambda. - line, _, pos = CloseExpression(clean_lines, linenum, len(match.group(1))) - if pos >= 0 and Match(r'^\s*[{(]', line[pos:]): - error(filename, linenum, 'build/c++11', - 4, # 4 = high confidence - 'Default lambda captures are an unapproved C++ feature.') + line = clean_lines.elided[linenum] + + # A lambda introducer specifies a default capture if it starts with "[=" + # or if it starts with "[&" _not_ followed by an identifier. + match = Match(r'^(.*)\[\s*(?:=|&[^\w])', line) + if match: + # Found a potential error, check what comes after the lambda-introducer. + # If it's not open parenthesis (for lambda-declarator) or open brace + # (for compound-statement), it's not a lambda. + line, _, pos = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if pos >= 0 and Match(r'^\s*[{(]', line[pos:]): + error( + filename, + linenum, + 'build/c++11', + 4, # 4 = high confidence + 'Default lambda captures are an unapproved C++ feature.') def CheckRedundantVirtual(filename, clean_lines, linenum, error): - """Check if line contains a redundant "virtual" function-specifier. + """Check if line contains a redundant "virtual" function-specifier. Args: filename: The name of the current file. @@ -5762,63 +5833,64 @@ def CheckRedundantVirtual(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - # Look for "virtual" on current line. - line = clean_lines.elided[linenum] - virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) - if not virtual: return - - # Ignore "virtual" keywords that are near access-specifiers. These - # are only used in class base-specifier and do not apply to member - # functions. - if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or - Match(r'^\s+(public|protected|private)\b', virtual.group(3))): - return - - # Ignore the "virtual" keyword from virtual base classes. Usually - # there is a column on the same line in these cases (virtual base - # classes are rare in google3 because multiple inheritance is rare). - if Match(r'^.*[^:]:[^:].*$', line): return - - # Look for the next opening parenthesis. This is the start of the - # parameter list (possibly on the next line shortly after virtual). - # TODO(unknown): doesn't work if there are virtual functions with - # decltype() or other things that use parentheses, but csearch suggests - # that this is rare. - end_col = -1 - end_line = -1 - start_col = len(virtual.group(2)) - for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): - line = clean_lines.elided[start_line][start_col:] - parameter_list = Match(r'^([^(]*)\(', line) - if parameter_list: - # Match parentheses to find the end of the parameter list - (_, end_line, end_col) = CloseExpression( - clean_lines, start_line, start_col + len(parameter_list.group(1))) - break - start_col = 0 - - if end_col < 0: - return # Couldn't find end of parameter list, give up - - # Look for "override" or "final" after the parameter list - # (possibly on the next few lines). - for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): - line = clean_lines.elided[i][end_col:] - match = Search(r'\b(override|final)\b', line) - if match: - error(filename, linenum, 'readability/inheritance', 4, - ('"virtual" is redundant since function is ' - 'already declared as "%s"' % match.group(1))) + # Look for "virtual" on current line. + line = clean_lines.elided[linenum] + virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) + if not virtual: return + + # Ignore "virtual" keywords that are near access-specifiers. These + # are only used in class base-specifier and do not apply to member + # functions. + if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or + Match(r'^\s+(public|protected|private)\b', virtual.group(3))): + return - # Set end_col to check whole lines after we are done with the - # first line. - end_col = 0 - if Search(r'[^\w]\s*$', line): - break + # Ignore the "virtual" keyword from virtual base classes. Usually + # there is a column on the same line in these cases (virtual base + # classes are rare in google3 because multiple inheritance is rare). + if Match(r'^.*[^:]:[^:].*$', line): return + + # Look for the next opening parenthesis. This is the start of the + # parameter list (possibly on the next line shortly after virtual). + # TODO(unknown): doesn't work if there are virtual functions with + # decltype() or other things that use parentheses, but csearch suggests + # that this is rare. + end_col = -1 + end_line = -1 + start_col = len(virtual.group(2)) + for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): + line = clean_lines.elided[start_line][start_col:] + parameter_list = Match(r'^([^(]*)\(', line) + if parameter_list: + # Match parentheses to find the end of the parameter list + (_, end_line, end_col) = CloseExpression( + clean_lines, start_line, + start_col + len(parameter_list.group(1))) + break + start_col = 0 + + if end_col < 0: + return # Couldn't find end of parameter list, give up + + # Look for "override" or "final" after the parameter list + # (possibly on the next few lines). + for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): + line = clean_lines.elided[i][end_col:] + match = Search(r'\b(override|final)\b', line) + if match: + error(filename, linenum, 'readability/inheritance', 4, + ('"virtual" is redundant since function is ' + 'already declared as "%s"' % match.group(1))) + + # Set end_col to check whole lines after we are done with the + # first line. + end_col = 0 + if Search(r'[^\w]\s*$', line): + break def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): - """Check if line contains a redundant "override" or "final" virt-specifier. + """Check if line contains a redundant "override" or "final" virt-specifier. Args: filename: The name of the current file. @@ -5826,32 +5898,30 @@ def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - # Look for closing parenthesis nearby. We need one to confirm where - # the declarator ends and where the virt-specifier starts to avoid - # false positives. - line = clean_lines.elided[linenum] - declarator_end = line.rfind(')') - if declarator_end >= 0: - fragment = line[declarator_end:] - else: - if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: - fragment = line + # Look for closing parenthesis nearby. We need one to confirm where + # the declarator ends and where the virt-specifier starts to avoid + # false positives. + line = clean_lines.elided[linenum] + declarator_end = line.rfind(')') + if declarator_end >= 0: + fragment = line[declarator_end:] else: - return - - # Check that at most one of "override" or "final" is present, not both - if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): - error(filename, linenum, 'readability/inheritance', 4, - ('"override" is redundant since function is ' - 'already declared as "final"')) - + if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: + fragment = line + else: + return + # Check that at most one of "override" or "final" is present, not both + if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): + error(filename, linenum, 'readability/inheritance', 4, + ('"override" is redundant since function is ' + 'already declared as "final"')) # Returns true if we are at a new block, and it is directly # inside of a namespace. def IsBlockInNameSpace(nesting_state, is_forward_declaration): - """Checks that the new block is directly in a namespace. + """Checks that the new block is directly in a namespace. Args: nesting_state: The _NestingState object that contains info about our state. @@ -5859,21 +5929,21 @@ def IsBlockInNameSpace(nesting_state, is_forward_declaration): Returns: Whether or not the new block is directly in a namespace. """ - if is_forward_declaration: - if len(nesting_state.stack) >= 1 and ( - isinstance(nesting_state.stack[-1], _NamespaceInfo)): - return True - else: - return False + if is_forward_declaration: + if len(nesting_state.stack) >= 1 and ( + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + return True + else: + return False - return (len(nesting_state.stack) > 1 and - nesting_state.stack[-1].check_namespace_indentation and - isinstance(nesting_state.stack[-2], _NamespaceInfo)) + return (len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.stack[-2], _NamespaceInfo)) def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, raw_lines_no_comments, linenum): - """This method determines if we should apply our namespace indentation check. + """This method determines if we should apply our namespace indentation check. Args: nesting_state: The current nesting state. @@ -5888,17 +5958,17 @@ def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, only works for classes and namespaces inside of a namespace. """ - is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, - linenum) + is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, + linenum) - if not (is_namespace_indent_item or is_forward_declaration): - return False + if not (is_namespace_indent_item or is_forward_declaration): + return False - # If we are in a macro, we do not want to check the namespace indentation. - if IsMacroDefinition(raw_lines_no_comments, linenum): - return False + # If we are in a macro, we do not want to check the namespace indentation. + if IsMacroDefinition(raw_lines_no_comments, linenum): + return False - return IsBlockInNameSpace(nesting_state, is_forward_declaration) + return IsBlockInNameSpace(nesting_state, is_forward_declaration) # Call this method if the line is directly inside of a namespace. @@ -5906,16 +5976,22 @@ def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, # an inner namespace, it cannot be indented. def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, error): - line = raw_lines_no_comments[linenum] - if Match(r'^\s+', line): - error(filename, linenum, 'runtime/indentation_namespace', 4, - 'Do not indent within a namespace') - - -def ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, + line = raw_lines_no_comments[linenum] + if Match(r'^\s+', line): + error(filename, linenum, 'runtime/indentation_namespace', 4, + 'Do not indent within a namespace') + + +def ProcessLine(filename, + file_extension, + clean_lines, + line, + include_state, + function_state, + nesting_state, + error, extra_check_functions=[]): - """Processes a single line in the file. + """Processes a single line in the file. Args: filename: Filename of the file that is being processed. @@ -5933,32 +6009,34 @@ def ProcessLine(filename, file_extension, clean_lines, line, run on each source line. Each function takes 4 arguments: filename, clean_lines, line, error """ - raw_lines = clean_lines.raw_lines - ParseNolintSuppressions(filename, raw_lines[line], line, error) - nesting_state.Update(filename, clean_lines, line, error) - CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, - error) - if nesting_state.InAsmBlock(): return - CheckForFunctionLengths(filename, clean_lines, line, function_state, error) - CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) - CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) - CheckLanguage(filename, clean_lines, line, file_extension, include_state, - nesting_state, error) - CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) - CheckForNonStandardConstructs(filename, clean_lines, line, - nesting_state, error) - CheckVlogArguments(filename, clean_lines, line, error) - CheckPosixThreading(filename, clean_lines, line, error) - CheckInvalidIncrement(filename, clean_lines, line, error) - CheckMakePairUsesDeduction(filename, clean_lines, line, error) - CheckDefaultLambdaCaptures(filename, clean_lines, line, error) - CheckRedundantVirtual(filename, clean_lines, line, error) - CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) - for check_fn in extra_check_functions: - check_fn(filename, clean_lines, line, error) + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error) + if nesting_state.InAsmBlock(): return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, + error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, nesting_state, + error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + CheckDefaultLambdaCaptures(filename, clean_lines, line, error) + CheckRedundantVirtual(filename, clean_lines, line, error) + CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + def FlagCxx11Features(filename, clean_lines, linenum, error): - """Flag those c++11 features that we only allow in certain places. + """Flag those c++11 features that we only allow in certain places. Args: filename: The name of the current file. @@ -5966,46 +6044,48 @@ def FlagCxx11Features(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Flag unapproved C++11 headers. - include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) - if include and include.group(1) in ('cfenv', - 'condition_variable', - 'fenv.h', - 'future', - 'mutex', - 'thread', - 'chrono', - 'ratio', - 'regex', - 'system_error', - ): - error(filename, linenum, 'build/c++11', 5, - ('<%s> is an unapproved C++11 header.') % include.group(1)) - - # The only place where we need to worry about C++11 keywords and library - # features in preprocessor directives is in macro definitions. - if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return - - # These are classes and free functions. The classes are always - # mentioned as std::*, but we only catch the free functions if - # they're not found by ADL. They're alphabetical by header. - for top_name in ( - # type_traits - 'alignment_of', - 'aligned_union', - ): - if Search(r'\bstd::%s\b' % top_name, line): - error(filename, linenum, 'build/c++11', 5, - ('std::%s is an unapproved C++11 class or function. Send c-style ' - 'an example of where it would make your code more readable, and ' - 'they may let you use it.') % top_name) - - -def ProcessFileData(filename, file_extension, lines, error, + line = clean_lines.elided[linenum] + + # Flag unapproved C++11 headers. + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + if include and include.group(1) in ( + 'cfenv', + 'condition_variable', + 'fenv.h', + 'future', + 'mutex', + 'thread', + 'chrono', + 'ratio', + 'regex', + 'system_error', ): + error(filename, linenum, 'build/c++11', 5, + ('<%s> is an unapproved C++11 header.') % include.group(1)) + + # The only place where we need to worry about C++11 keywords and library + # features in preprocessor directives is in macro definitions. + if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return + + # These are classes and free functions. The classes are always + # mentioned as std::*, but we only catch the free functions if + # they're not found by ADL. They're alphabetical by header. + for top_name in ( + # type_traits + 'alignment_of', + 'aligned_union', ): + if Search(r'\bstd::%s\b' % top_name, line): + error(filename, linenum, 'build/c++11', 5, ( + 'std::%s is an unapproved C++11 class or function. Send c-style ' + 'an example of where it would make your code more readable, and ' + 'they may let you use it.') % top_name) + + +def ProcessFileData(filename, + file_extension, + lines, + error, extra_check_functions=[]): - """Performs lint checks and reports any errors to the given error function. + """Performs lint checks and reports any errors to the given error function. Args: filename: Filename of the file that is being processed. @@ -6018,44 +6098,44 @@ def ProcessFileData(filename, file_extension, lines, error, run on each source line. Each function takes 4 arguments: filename, clean_lines, line, error """ - lines = (['// marker so line numbers and indices both start at 1'] + lines + - ['// marker so line numbers end in a known way']) + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) - include_state = _IncludeState() - function_state = _FunctionState() - nesting_state = NestingState() + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = NestingState() - ResetNolintSuppressions() + ResetNolintSuppressions() - CheckForCopyright(filename, lines, error) + CheckForCopyright(filename, lines, error) - RemoveMultiLineComments(filename, lines, error) - clean_lines = CleansedLines(lines) + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) - if file_extension == 'h': - CheckForHeaderGuard(filename, clean_lines, error) + if file_extension == 'h': + CheckForHeaderGuard(filename, clean_lines, error) - for line in xrange(clean_lines.NumLines()): - ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, - extra_check_functions) - FlagCxx11Features(filename, clean_lines, line, error) - nesting_state.CheckCompletedBlocks(filename, error) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, include_state, + function_state, nesting_state, error, extra_check_functions) + FlagCxx11Features(filename, clean_lines, line, error) + nesting_state.CheckCompletedBlocks(filename, error) - CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) - - # Check that the .cc file has included its header if it exists. - if file_extension == 'cc': - CheckHeaderFileIncluded(filename, include_state, error) + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) - # We check here rather than inside ProcessLine so that we see raw - # lines rather than "cleaned" lines. - CheckForBadCharacters(filename, lines, error) + # Check that the .cc file has included its header if it exists. + if file_extension == 'cc': + CheckHeaderFileIncluded(filename, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) - CheckForNewlineAtEOF(filename, lines, error) def ProcessConfigOverrides(filename): - """ Loads the configuration files and processes the config overrides. + """ Loads the configuration files and processes the config overrides. Args: filename: The name of the file being processed by the linter. @@ -6064,74 +6144,76 @@ def ProcessConfigOverrides(filename): False if the current |filename| should not be processed further. """ - abs_filename = os.path.abspath(filename) - cfg_filters = [] - keep_looking = True - while keep_looking: - abs_path, base_name = os.path.split(abs_filename) - if not base_name: - break # Reached the root directory. - - cfg_file = os.path.join(abs_path, "CPPLINT.cfg") - abs_filename = abs_path - if not os.path.isfile(cfg_file): - continue - - try: - with open(cfg_file) as file_handle: - for line in file_handle: - line, _, _ = line.partition('#') # Remove comments. - if not line.strip(): + abs_filename = os.path.abspath(filename) + cfg_filters = [] + keep_looking = True + while keep_looking: + abs_path, base_name = os.path.split(abs_filename) + if not base_name: + break # Reached the root directory. + + cfg_file = os.path.join(abs_path, "CPPLINT.cfg") + abs_filename = abs_path + if not os.path.isfile(cfg_file): continue - name, _, val = line.partition('=') - name = name.strip() - val = val.strip() - if name == 'set noparent': - keep_looking = False - elif name == 'filter': - cfg_filters.append(val) - elif name == 'exclude_files': - # When matching exclude_files pattern, use the base_name of - # the current file name or the directory name we are processing. - # For example, if we are checking for lint errors in /foo/bar/baz.cc - # and we found the .cfg file at /foo/CPPLINT.cfg, then the config - # file's "exclude_files" filter is meant to be checked against "bar" - # and not "baz" nor "bar/baz.cc". - if base_name: - pattern = re.compile(val) - if pattern.match(base_name): - sys.stderr.write('Ignoring "%s": file excluded by "%s". ' - 'File path component "%s" matches ' - 'pattern "%s"\n' % - (filename, cfg_file, base_name, val)) - return False - elif name == 'linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - sys.stderr.write('Line length must be numeric.') - else: + try: + with open(cfg_file) as file_handle: + for line in file_handle: + line, _, _ = line.partition('#') # Remove comments. + if not line.strip(): + continue + + name, _, val = line.partition('=') + name = name.strip() + val = val.strip() + if name == 'set noparent': + keep_looking = False + elif name == 'filter': + cfg_filters.append(val) + elif name == 'exclude_files': + # When matching exclude_files pattern, use the base_name of + # the current file name or the directory name we are processing. + # For example, if we are checking for lint errors in /foo/bar/baz.cc + # and we found the .cfg file at /foo/CPPLINT.cfg, then the config + # file's "exclude_files" filter is meant to be checked against "bar" + # and not "baz" nor "bar/baz.cc". + if base_name: + pattern = re.compile(val) + if pattern.match(base_name): + sys.stderr.write( + 'Ignoring "%s": file excluded by "%s". ' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) + return False + elif name == 'linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + sys.stderr.write('Line length must be numeric.') + else: + sys.stderr.write( + 'Invalid configuration option (%s) in file %s\n' % + (name, cfg_file)) + + except IOError: sys.stderr.write( - 'Invalid configuration option (%s) in file %s\n' % - (name, cfg_file)) - - except IOError: - sys.stderr.write( - "Skipping config file '%s': Can't open for reading\n" % cfg_file) - keep_looking = False + "Skipping config file '%s': Can't open for reading\n" % + cfg_file) + keep_looking = False - # Apply all the accumulated filters in reverse order (top-level directory - # config options having the least priority). - for filter in reversed(cfg_filters): - _AddFilters(filter) + # Apply all the accumulated filters in reverse order (top-level directory + # config options having the least priority). + for filter in reversed(cfg_filters): + _AddFilters(filter) - return True + return True def ProcessFile(filename, vlevel, extra_check_functions=[]): - """Does google-lint on a single file. + """Does google-lint on a single file. Args: filename: The name of the file to parse. @@ -6144,104 +6226,105 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): arguments: filename, clean_lines, line, error """ - _SetVerboseLevel(vlevel) - _BackupFilters() + _SetVerboseLevel(vlevel) + _BackupFilters() - if not ProcessConfigOverrides(filename): - _RestoreFilters() - return - - lf_lines = [] - crlf_lines = [] - try: - # Support the UNIX convention of using "-" for stdin. Note that - # we are not opening the file with universal newline support - # (which codecs doesn't support anyway), so the resulting lines do - # contain trailing '\r' characters if we are reading a file that - # has CRLF endings. - # If after the split a trailing '\r' is present, it is removed - # below. - if filename == '-': - lines = codecs.StreamReaderWriter(sys.stdin, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace').read().split('\n') + if not ProcessConfigOverrides(filename): + _RestoreFilters() + return + + lf_lines = [] + crlf_lines = [] + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', + 'replace').read().split('\n') + + # Remove trailing '\r'. + # The -1 accounts for the extra trailing blank line we get from split() + for linenum in range(len(lines) - 1): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + crlf_lines.append(linenum + 1) + else: + lf_lines.append(linenum + 1) + + except IOError: + sys.stderr.write("Skipping input '%s': Can't open for reading\n" % + filename) + _RestoreFilters() + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) else: - lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') - - # Remove trailing '\r'. - # The -1 accounts for the extra trailing blank line we get from split() - for linenum in range(len(lines) - 1): - if lines[linenum].endswith('\r'): - lines[linenum] = lines[linenum].rstrip('\r') - crlf_lines.append(linenum + 1) - else: - lf_lines.append(linenum + 1) - - except IOError: - sys.stderr.write( - "Skipping input '%s': Can't open for reading\n" % filename) - _RestoreFilters() - return - - # Note, if no dot is found, this will give the entire filename as the ext. - file_extension = filename[filename.rfind('.') + 1:] - - # When reading from stdin, the extension is unknown, so no cpplint tests - # should rely on the extension. - if filename != '-' and file_extension not in _valid_extensions: - sys.stderr.write('Ignoring %s; not a valid file name ' - '(%s)\n' % (filename, ', '.join(_valid_extensions))) - else: - ProcessFileData(filename, file_extension, lines, Error, - extra_check_functions) - - # If end-of-line sequences are a mix of LF and CR-LF, issue - # warnings on the lines with CR. - # - # Don't issue any warnings if all lines are uniformly LF or CR-LF, - # since critique can handle these just fine, and the style guide - # doesn't dictate a particular end of line sequence. - # - # We can't depend on os.linesep to determine what the desired - # end-of-line sequence should be, since that will return the - # server-side end-of-line sequence. - if lf_lines and crlf_lines: - # Warn on every line with CR. An alternative approach might be to - # check whether the file is mostly CRLF or just LF, and warn on the - # minority, we bias toward LF here since most tools prefer LF. - for linenum in crlf_lines: - Error(filename, linenum, 'whitespace/newline', 1, - 'Unexpected \\r (^M) found; better to use only \\n') + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) - sys.stdout.write('Done processing %s\n' % filename) - _RestoreFilters() + # If end-of-line sequences are a mix of LF and CR-LF, issue + # warnings on the lines with CR. + # + # Don't issue any warnings if all lines are uniformly LF or CR-LF, + # since critique can handle these just fine, and the style guide + # doesn't dictate a particular end of line sequence. + # + # We can't depend on os.linesep to determine what the desired + # end-of-line sequence should be, since that will return the + # server-side end-of-line sequence. + if lf_lines and crlf_lines: + # Warn on every line with CR. An alternative approach might be to + # check whether the file is mostly CRLF or just LF, and warn on the + # minority, we bias toward LF here since most tools prefer LF. + for linenum in crlf_lines: + Error(filename, linenum, 'whitespace/newline', 1, + 'Unexpected \\r (^M) found; better to use only \\n') + + sys.stdout.write('Done processing %s\n' % filename) + _RestoreFilters() def PrintUsage(message): - """Prints a brief usage string and exits, optionally with an error message. + """Prints a brief usage string and exits, optionally with an error message. Args: message: The optional error message. """ - sys.stderr.write(_USAGE) - if message: - sys.exit('\nFATAL ERROR: ' + message) - else: - sys.exit(1) + sys.stderr.write(_USAGE) + if message: + sys.exit('\nFATAL ERROR: ' + message) + else: + sys.exit(1) def PrintCategories(): - """Prints a list of all the error-categories used by error messages. + """Prints a list of all the error-categories used by error messages. These are the categories used to filter messages via --filter. """ - sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) - sys.exit(0) + sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) + sys.exit(0) def ParseArguments(args): - """Parses the command line arguments. + """Parses the command line arguments. This may set the output format and verbosity level as side-effects. @@ -6251,82 +6334,82 @@ def ParseArguments(args): Returns: The list of filenames to lint. """ - try: - (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', - 'counting=', - 'filter=', - 'root=', - 'linelength=', - 'extensions=']) - except getopt.GetoptError: - PrintUsage('Invalid arguments.') - - verbosity = _VerboseLevel() - output_format = _OutputFormat() - filters = '' - counting_style = '' - - for (opt, val) in opts: - if opt == '--help': - PrintUsage(None) - elif opt == '--output': - if val not in ('emacs', 'vs7', 'eclipse'): - PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') - output_format = val - elif opt == '--verbose': - verbosity = int(val) - elif opt == '--filter': - filters = val - if not filters: - PrintCategories() - elif opt == '--counting': - if val not in ('total', 'toplevel', 'detailed'): - PrintUsage('Valid counting options are total, toplevel, and detailed') - counting_style = val - elif opt == '--root': - global _root - _root = val - elif opt == '--linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - PrintUsage('Line length must be digits.') - elif opt == '--extensions': - global _valid_extensions - try: - _valid_extensions = set(val.split(',')) - except ValueError: - PrintUsage('Extensions must be comma seperated list.') - - if not filenames: - PrintUsage('No files were specified.') - - _SetOutputFormat(output_format) - _SetVerboseLevel(verbosity) - _SetFilters(filters) - _SetCountingStyle(counting_style) - - return filenames + try: + (opts, filenames) = getopt.getopt(args, '', [ + 'help', 'output=', 'verbose=', 'counting=', 'filter=', 'root=', + 'linelength=', 'extensions=' + ]) + except getopt.GetoptError: + PrintUsage('Invalid arguments.') + + verbosity = _VerboseLevel() + output_format = _OutputFormat() + filters = '' + counting_style = '' + + for (opt, val) in opts: + if opt == '--help': + PrintUsage(None) + elif opt == '--output': + if val not in ('emacs', 'vs7', 'eclipse'): + PrintUsage( + 'The only allowed output formats are emacs, vs7 and eclipse.' + ) + output_format = val + elif opt == '--verbose': + verbosity = int(val) + elif opt == '--filter': + filters = val + if not filters: + PrintCategories() + elif opt == '--counting': + if val not in ('total', 'toplevel', 'detailed'): + PrintUsage( + 'Valid counting options are total, toplevel, and detailed') + counting_style = val + elif opt == '--root': + global _root + _root = val + elif opt == '--linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + PrintUsage('Line length must be digits.') + elif opt == '--extensions': + global _valid_extensions + try: + _valid_extensions = set(val.split(',')) + except ValueError: + PrintUsage('Extensions must be comma seperated list.') + + if not filenames: + PrintUsage('No files were specified.') + + _SetOutputFormat(output_format) + _SetVerboseLevel(verbosity) + _SetFilters(filters) + _SetCountingStyle(counting_style) + + return filenames def main(): - filenames = ParseArguments(sys.argv[1:]) + filenames = ParseArguments(sys.argv[1:]) - # Change stderr to write with replacement characters so we don't die - # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReaderWriter(sys.stderr, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace') + # Change stderr to write with replacement characters so we don't die + # if we try to print something containing non-ASCII characters. + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), 'replace') - _cpplint_state.ResetErrorCounts() - for filename in filenames: - ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() + _cpplint_state.ResetErrorCounts() + for filename in filenames: + ProcessFile(filename, _cpplint_state.verbose_level) + _cpplint_state.PrintErrorCounts() - sys.exit(_cpplint_state.error_count > 0) + sys.exit(_cpplint_state.error_count > 0) if __name__ == '__main__': - main() + main() diff --git a/paddle/scripts/deb/build_scripts/build.sh b/paddle/scripts/deb/build_scripts/build.sh index 662d2a9103f7da62d96650f490688d02b2c4669e..66a1cfb247dad0292c0832046fb121d14b15b5ba 100755 --- a/paddle/scripts/deb/build_scripts/build.sh +++ b/paddle/scripts/deb/build_scripts/build.sh @@ -33,5 +33,3 @@ cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/ make -j `nproc` cpack -D CPACK_GENERATOR='DEB' .. mv *.deb ~/dist/gpu-noavx - - diff --git a/paddle/scripts/docker/generate.sh b/paddle/scripts/docker/generate.sh index 8a50aefd34955fcc54c25c833154997472449f93..2ad7527db127f3bd2018a7a1f5b40dacfecca6da 100644 --- a/paddle/scripts/docker/generate.sh +++ b/paddle/scripts/docker/generate.sh @@ -58,4 +58,3 @@ m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \ -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ -DPADDLE_WITH_AVX=OFF \ Dockerfile.m4 > Dockerfile.gpu-noavx-demo - diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh index 37e27d665b12fb5b3b8ec7ad245d4587cb0361d6..9b6e420ca7931f0d17da461c7579bf4dc69e18e0 100755 --- a/paddle/scripts/travis/common.sh +++ b/paddle/scripts/travis/common.sh @@ -2,4 +2,3 @@ set -e mkdir -p ../../../build cd ../../../build - diff --git a/paddle/trainer/tests/test.txt b/paddle/trainer/tests/test.txt index 68e7f72e3d8d4ee4e592309cfc230fad24a810d4..3ad503b34f2e1a84c632d0894f180b5cf9ac550a 100644 --- a/paddle/trainer/tests/test.txt +++ b/paddle/trainer/tests/test.txt @@ -998,4 +998,3 @@ from IN B-PP Friday NNP B-NP 's POS B-NP Tokyo NNP I-NP - diff --git a/paddle/trainer/tests/test_gen_dict.txt b/paddle/trainer/tests/test_gen_dict.txt index 91a84146180e0135b37ee8c76508e588412c2870..1000f90057824bf665b32fe47a7f78e7a0077e7b 100644 --- a/paddle/trainer/tests/test_gen_dict.txt +++ b/paddle/trainer/tests/test_gen_dict.txt @@ -6,4 +6,4 @@ 5 6 7 -8 \ No newline at end of file +8 diff --git a/paddle/trainer/tests/train.txt b/paddle/trainer/tests/train.txt index 8d9b15dcf5bba477c2c6f732806a47b0aa6e098a..2313aee987ba71ba7ea779d3cf7705478e7fbde2 100644 --- a/paddle/trainer/tests/train.txt +++ b/paddle/trainer/tests/train.txt @@ -4998,4 +4998,3 @@ However RB B-ADVP the DT B-NP disclosure NN I-NP of IN B-PP - diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp index d5f6018864cb9f14dd3006facebf82c66e909736..9bb6827540f61e8c6cc8b64c2b04ed4d0fcebab1 100644 --- a/paddle/utils/tests/test_CommandLineParser.cpp +++ b/paddle/utils/tests/test_CommandLineParser.cpp @@ -109,4 +109,3 @@ int main(int argc, char** argv) { } #endif - diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 881f0b821491be12e57be0fef04a38fc95fca4eb..dbe2f3b29278c259945564959690a3aa6c0cfbe0 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -410,8 +410,8 @@ def RecurrentLayerGroupEnd(name): "RecurrentLayerGroup not begin") for pair in g_current_submodel.memories: #check exist layer = g_layer_map[pair.layer_name] - config_assert(layer is not None, "memory declare wrong name:%s" % - pair.layer_name) + config_assert(layer is not None, + "memory declare wrong name:%s" % pair.layer_name) memory_link = g_layer_map[pair.link_name] config_assert(layer.size == memory_link.size, "memory declare wrong size:%d" % memory_link.size) @@ -592,6 +592,20 @@ class DotMulProjection(Projection): def calc_parameter_dims(self, input_size, output_size): return [1, output_size] +# ScalingProjection +@config_class +class ScalingProjection(Projection): + type = 'scaling' + + def calc_output_size(self, input_layer_config): + return input_layer_config.size + + def calc_parameter_size(self, input_size, output_size): + return 1 + + def calc_parameter_dims(self, input_size, output_size): + return [1, 1] + @config_class class TableProjection(Projection): @@ -672,8 +686,8 @@ class ConvProjection(Projection): parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf, num_filters) # TODO: support rectangle input - self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x** - 2) * num_filters + self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x + **2) * num_filters def calc_output_size(self, input_layer_config): return self.proj_conf.output_size @@ -2779,8 +2793,8 @@ class ConcatenateLayer2(LayerBase): @config_layer('recurrent') class RecurrentLayer(LayerBase): def __init__(self, name, inputs, reversed=False, bias=True, **xargs): - super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, ** - xargs) + super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, + **xargs) config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input') input_layer = self.get_input_layer(0) size = input_layer.size @@ -2862,22 +2876,22 @@ class MDLstmLayer(LayerBase): active_state_type="sigmoid", bias=True, **xargs): - super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs, ** - xargs) + super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs, + **xargs) config_assert(len(self.inputs) == 1, 'MDLstmLayer must have 1 input') input_layer = self.get_input_layer(0) dim_num = len(directions) #check input_layer.size is divided by (3+dim_num) - config_assert(input_layer.size % - (3 + dim_num) == 0, "size % (dim_num) should be 0!") + config_assert(input_layer.size % (3 + dim_num) == 0, + "size % (dim_num) should be 0!") size = input_layer.size / (3 + dim_num) self.set_layer_size(size) self.config.active_gate_type = active_gate_type self.config.active_state_type = active_state_type for i in xrange(len(directions)): self.config.directions.append(int(directions[i])) - self.create_input_parameter(0, size * size * - (3 + dim_num), [size, size, 3 + dim_num]) + self.create_input_parameter(0, size * size * (3 + dim_num), + [size, size, 3 + dim_num]) #bias includes 3 kinds of peephole, 3+dim_num+2+dim_num self.create_bias_parameter(bias, size * (5 + 2 * dim_num)) @@ -2915,8 +2929,8 @@ class GruStepLayer(LayerBase): active_gate_type="sigmoid", bias=True, **xargs): - super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs, ** - xargs) + super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs, + **xargs) config_assert(len(self.inputs) == 2, 'GruStepLayer must have 2 input') input_layer0 = self.get_input_layer(0) input_layer1 = self.get_input_layer(1) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 952b1f097133c68643cb201fa30452e9e08afcd8..f0757b9ce21fc1f3bfbf72c4733e5c0ccdfebec0 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -65,6 +65,7 @@ __all__ = [ 'StaticInput', 'expand_layer', 'scaling_layer', + 'scaling_projection', 'power_layer', 'interpolation_layer', 'bilinear_interp_layer', @@ -458,7 +459,7 @@ def identity_projection(input, offset=None): :type input: LayerOutput :param offset: Offset, None if use default. :type offset: int - :return: A IdentityProjection or IdentityOffsetProjection Object + :return: A IdentityProjection or IdentityOffsetProjection object :rtype: IdentityProjection or IdentityOffsetProjection """ if offset is None: @@ -471,6 +472,34 @@ def identity_projection(input, offset=None): return proj +@wrap_param_attr_default() +def scaling_projection(input, param_attr=None): + """ + scaling_projection multiplies the input with a scalar parameter and add to + the output. + + .. math:: + out += w * in + + The example usage is: + + .. code-block:: python + + proj = scaling_projection(input=layer) + + :param input: Input Layer. + :type input: LayerOutput + :param param_attr: Parameter config, None if use default. + :type param_attr: ParameterAttribute + :return: A ScalingProjection object + :rtype: ScalingProjection + """ + proj = ScalingProjection(input_layer_name=input.name, + **param_attr.attr) + proj.origin = input + return proj + + @wrap_param_attr_default() def dotmul_projection(input, param_attr=None): """ @@ -1426,11 +1455,11 @@ def bilinear_interp_layer(input, .. code-block:: python bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64) - + :param input: A input layer. :type input: LayerOutput. :param out_size_x: bilinear interpolation output width. - :type out_size_x: int|None + :type out_size_x: int|None :param out_size_y: bilinear interpolation output height. :type out_size_y: int|None :param name: The layer's name, which cna not be specified. @@ -1742,11 +1771,11 @@ def img_conv_layer(input, The details of convolution layer, please refer UFLDL's `convolution `_ . - - Convolution Transpose (deconv) layer for image. Paddle only support square + + Convolution Transpose (deconv) layer for image. Paddle only support square input currently and thus input image's width equals height. - The details of convolution transpose layer, + The details of convolution transpose layer, please refer to the following explanation and references therein `_ . @@ -4411,7 +4440,7 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): .. code-block:: python - cost = cross_entropy(input=input_layer, + cost = cross_entropy(input=input_layer, label=label_layer) :param input: The first input layer. @@ -4451,7 +4480,7 @@ def cross_entropy_with_selfnorm(input, .. code-block:: python - cost = cross_entropy_with_selfnorm(input=input_layer, + cost = cross_entropy_with_selfnorm(input=input_layer, label=label_layer) :param input: The first input layer. @@ -4521,7 +4550,7 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None): .. code-block:: python - cost = huber_cost(input=input_layer, + cost = huber_cost(input=input_layer, label=label_layer) :param input: The first input layer. @@ -4561,7 +4590,7 @@ def multi_binary_label_cross_entropy(input, .. code-block:: python - cost = multi_binary_label_cross_entropy(input=input_layer, + cost = multi_binary_label_cross_entropy(input=input_layer, label=label_layer) :param input: The first input layer. diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py index 19ac6ec9061d67fa73e10e95e15fde5322b00503..aa4521dcd5db3f845871cfaaedb02a86bcbddc38 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/projections.py +++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py @@ -26,6 +26,7 @@ with mixed_layer() as m5: with mixed_layer() as m6: m6 += dotmul_operator(a=m3, b=m4) + m6 += scaling_projection(m3) img = data_layer(name='img', size=32 * 32) flt = data_layer(name='filter', size=3 * 3 * 1 * 64) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index e47e531a2223ddaa9dd1dfaf1fcee8a11008cbbd..2b3951c242411e0c0990a52bcb2ae6b1723a9367 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -111,13 +111,23 @@ layers { inputs { input_layer_name: "__mixed_2__" } + inputs { + input_layer_name: "__mixed_2__" + input_parameter_name: "___mixed_5__.w1" + proj_conf { + type: "scaling" + name: "___mixed_5__.w1" + input_size: 100 + output_size: 100 + } + } inputs { input_layer_name: "__mixed_3__" } operator_confs { type: "dot_mul" input_indices: 0 - input_indices: 1 + input_indices: 2 input_sizes: 100 input_sizes: 100 output_size: 100 @@ -258,6 +268,16 @@ parameters { initial_strategy: 0 initial_smart: false } +parameters { + name: "___mixed_5__.w1" + size: 1 + initial_mean: 0.0 + initial_std: 1.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: true +} parameters { name: "___mixed_7__.w0" size: 30000 diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 4f3c9434efbd80e130ecbb4d2cd3b056555b3af3..3e93f41c2e32025b3e29a0990833d7e97a7c8caa 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['dump_config'] \ No newline at end of file +__all__ = ['dump_config']