diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 9898dc083ebb1783a0e2ddd12afaa9c3d5a79e98..47ca1833967ee705d6558b1dad06a6335b30f03a 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -8,6 +8,7 @@ add_subdirectory(gserver) add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) +add_subdirectory(strings) # Do not build go directory until go cmake is working smoothly. # if(CMAKE_Go_COMPILER) diff --git a/paddle/strings/CMakeLists.txt b/paddle/strings/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e55eecd484c0e218ecd51bbd19b3eb4f6f92a25 --- /dev/null +++ b/paddle/strings/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(stringpiece SRCS stringpiece.cc) +cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags) diff --git a/paddle/strings/stringpiece.cc b/paddle/strings/stringpiece.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef46490fb1aaecbc8450b183fa4919f64fb97ba4 --- /dev/null +++ b/paddle/strings/stringpiece.cc @@ -0,0 +1,127 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/strings/stringpiece.h" + +#include +#include +#include + +namespace paddle { + +StringPiece::StringPiece() : data_(NULL), size_(0) {} + +StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) { + if (d == NULL && n != 0) + throw std::invalid_argument( + "StringPiece requires len to be 0 for NULL data"); +} + +StringPiece::StringPiece(const char* s) : data_(s), size_(strlen(s)) { + if (s == NULL) size_ = 0; +} + +StringPiece::StringPiece(const std::string& s) + : data_(s.data()), size_(s.size()) {} + +int Compare(StringPiece a, StringPiece b) { + const size_t min_len = (a.len() < b.len()) ? a.len() : b.len(); + int r = memcmp(a.data(), b.data(), min_len); + if (r == 0) { + if (a.len() < b.len()) + return -1; + else if (a.len() > b.len()) + return 1; + } + return r; +} + +bool operator==(StringPiece x, StringPiece y) { + return ((x.len() == y.len()) && + (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0)); +} + +bool operator!=(StringPiece x, StringPiece y) { return !(x == y); } + +bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; } +bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; } + +bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; } +bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; } + +bool HasPrefix(StringPiece s, StringPiece x) { + return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0)); +} + +bool HasSuffix(StringPiece s, StringPiece x) { + return ((s.len() >= x.len()) && + (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0)); +} + +StringPiece SkipPrefix(StringPiece s, size_t n) { + assert(n <= s.len()); + return StringPiece(s.data() + n, s.len() - n); +} + +StringPiece SkipSuffix(StringPiece s, size_t n) { + assert(size_ >= n); + return StringPiece(s.data(), s.len() - n); +} + +StringPiece TrimPrefix(StringPiece s, StringPiece x) { + return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s; +} + +StringPiece TrimSuffix(StringPiece s, StringPiece x) { + return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s; +} + +bool Contains(StringPiece s, StringPiece sub) { + return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end(); +} + +size_t Index(StringPiece s, StringPiece sub) { + auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end()); + return e != s.end() ? e - s.data() : StringPiece::npos; +} + +size_t Find(StringPiece s, char c, size_t pos) { + if (pos >= s.len()) { + return StringPiece::npos; + } + const char* result = + reinterpret_cast(memchr(s.data() + pos, c, s.len() - pos)); + return result != nullptr ? result - s.data() : StringPiece::npos; +} + +size_t RFind(StringPiece s, char c, size_t pos) { + if (s.len() == 0) return StringPiece::npos; + for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data(); + p--) { + if (*p == c) { + return p - s.data(); + } + } + return StringPiece::npos; +} + +StringPiece SubStr(StringPiece s, size_t pos, size_t n) { + if (pos > s.len()) pos = s.len(); + if (n > s.len() - pos) n = s.len() - pos; + return StringPiece(s.data() + pos, n); +} + +} // namespace paddle diff --git a/paddle/strings/stringpiece.h b/paddle/strings/stringpiece.h new file mode 100644 index 0000000000000000000000000000000000000000..febb9036dbabba1d180bd1aeedcb665782dcdba9 --- /dev/null +++ b/paddle/strings/stringpiece.h @@ -0,0 +1,119 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#pragma once + +#include +#include + +#include +#include + +namespace paddle { + +// StringPiece points into a std::string object but doesn't own the +// string. It is for efficient access to strings. Like Go's string +// type. StringPiece is not thread-safe. +class StringPiece { +public: + static const size_t npos = static_cast(-1); + + // We provide non-explicit singleton constructors so users can + // pass in a "const char*" or a "string" wherever a "StringPiece" + // is expected. These contructors ensure that if data_ is NULL, + // size_ is 0. + StringPiece(); + StringPiece(const char* d, size_t n); + StringPiece(const char* s); + StringPiece(const std::string& s); + + // For a string, cap() returns the size of storage and len() + // returns the currently used storage. For a StringPiece these + // are the same value. + size_t cap() const { return size_; } + size_t len() const { return size_; } + size_t size() const { return size_; } + + const char* data() const { return data_; } + + char operator[](size_t n) const { + assert(n < len()); + return data_[n]; + } + + // StringPiece doesn't own the string, so both iterator and const + // iterator are const char* indeed. + typedef const char* const_iterator; + typedef const char* iterator; + iterator begin() const { return data_; } + iterator end() const { return data_ + size_; } + + struct Hasher { + size_t operator()(StringPiece arg) const; + }; + + // Return a string that contains the copy of the referenced data. + std::string ToString() const { return std::string(data_, size_); } + +private: + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +// Because StringPiece contains a little data members, and without the +// ownership, it is so cheap to pass StringPieces around, we don't +// need to define parrameters of the following operations to be +// references. Also, it is cheap to construct new StringPieces, so we +// don't define mutative operations as member functions. + +int Compare(StringPiece a, StringPiece b); + +bool operator==(StringPiece x, StringPiece y); +bool operator!=(StringPiece x, StringPiece y); +bool operator<(StringPiece x, StringPiece y); +bool operator>(StringPiece x, StringPiece y); +bool operator<=(StringPiece x, StringPiece y); +bool operator>=(StringPiece x, StringPiece y); + +bool HasPrefix(StringPiece s, StringPiece prefix); +bool HasSuffix(StringPiece s, StringPiece suffix); + +StringPiece SkipPrefix(StringPiece s, size_t n); +StringPiece SkipSuffix(StringPiece s, size_t n); + +// Skip the prefix (or suffix) if it matches with the string. +StringPiece TrimPrefix(StringPiece s, StringPiece prefix); +StringPiece TrimSuffix(StringPiece s, StringPiece suffix); + +bool Contains(StringPiece s, StringPiece sub); + +// Return the first occurrence of sub in s, or npos. +size_t Index(StringPiece s, StringPiece sub); + +// Return the first occurrence of c in s[pos:end], or npos. +size_t Find(StringPiece s, char c, size_t pos); + +// Search range is [0..pos] inclusive. If pos == npos, search everything. +size_t RFind(StringPiece s, char c, size_t pos); + +StringPiece SubStr(StringPiece s, size_t pos, size_t n); + +// allow StringPiece to be logged +extern std::ostream& operator<<(std::ostream& o, StringPiece piece); + +} // namespace paddle diff --git a/paddle/strings/stringpiece_test.cc b/paddle/strings/stringpiece_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..79fc219a9c9056b9dd77ba6caa1de3ff136bb844 --- /dev/null +++ b/paddle/strings/stringpiece_test.cc @@ -0,0 +1,58 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/strings/stringpiece.h" +#include "gtest/gtest.h" + +TEST(StringPiece, Construct) { + { + paddle::StringPiece s; + EXPECT_EQ(NULL, s.data()); + EXPECT_EQ(0U, s.len()); + EXPECT_EQ(0U, s.cap()); + } + { + EXPECT_THROW([] { paddle::StringPiece s(NULL, 10000U); }(), + std::invalid_argument); + } + { + paddle::StringPiece s(NULL); + EXPECT_EQ(0U, s.len()); + } + { + std::string a; + EXPECT_EQ(0U, a.size()); + paddle::StringPiece s(a); + EXPECT_EQ(0U, s.len()); + } +} + +TEST(StringPiece, CopyAndAssign) { + paddle::StringPiece empty; + EXPECT_EQ(0U, empty.len()); + EXPECT_EQ(0U, empty.cap()); + + paddle::StringPiece a("hello"); + paddle::StringPiece b = a; + EXPECT_EQ(b.len(), strlen("hello")); + EXPECT_EQ(b.cap(), strlen("hello")); + EXPECT_EQ(a, b); + + std::string storage("hello"); + paddle::StringPiece c(storage); + EXPECT_EQ(a, c); + EXPECT_NE(a.data(), c.data()); +}